change utils.http.request to best-effort detect on-page encoding

closes #113
author: jesopo 2019-09-09 14:10:58 +0100
committer: jesopo 2019-09-09 14:11:18 +0100
commit: ff9c82bf671c31561231d0290f5ddaf0a98a088d (patch)
tree: 1afd6fc241a52dc1d7e3d21fad843a1f49ab7c4a
parent: remove redundant (and maybe wrong) GET params from webfinger call (diff)
1 files changed, 34 insertions, 2 deletions
diff --git a/src/utils/http.py b/src/utils/http.py
index 11a73fc4..2f529973 100644
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -58,6 +58,28 @@ class Response(object):
         self.data = data
         self.headers = headers
 
+def _meta_content(s: str) -> typing.Dict[str, str]:
+    """Parse the `content` value of a <meta> tag into a dict.
+
+    The value is treated as a ";"-separated list of `key=value` pairs, e.g.
+    "text/html; charset=utf-8" -> {"text/html": "", "charset": "utf-8"}.
+    Segments without a "=" map to an empty string. Keys are lowercased so
+    that lookups such as `["charset"]` do not depend on the page's casing.
+    A missing (None) or empty value yields an empty dict.
+    """
+    out = {}
+    # Split the argument itself; `str.split(";")` would call the builtin
+    # type's method on the literal ";" and ignore the input entirely.
+    for keyvalue in (s or "").split(";"):
+        key, _, value = keyvalue.strip().partition("=")
+        key = key.strip().lower()
+        if not key:
+            # empty segment, e.g. produced by a trailing ";"
+            continue
+        out[key] = value.strip()
+    return out
+
+def _find_encoding(soup: bs4.BeautifulSoup) -> typing.Optional[str]:
+    """Best-effort detection of the encoding a parsed HTML page declares.
+
+    Checked in order:
+      1. `<meta charset="...">`
+      2. `<meta http-equiv="Content-Type" content="...; charset=...">`
+      3. a bare `<!DOCTYPE html>`, for which "utf8" is assumed
+
+    Returns the declared encoding name, or None when nothing usable is
+    found so that the caller can fall back to another source.
+    """
+    # Search every <meta>, not only the first one: pages often put e.g. a
+    # viewport tag first, and `soup.meta` is None when there is no <meta>.
+    meta_charset = soup.find("meta", attrs={"charset": True})
+    if meta_charset is not None and meta_charset.get("charset"):
+        return meta_charset.get("charset")
+
+    meta_content_type = soup.find("meta",
+        attrs={"http-equiv": lambda v: (v or "").lower() == "content-type"})
+    if meta_content_type is not None:
+        # The tag may lack `content`, or the content may lack `charset`;
+        # in both cases fall through instead of raising.
+        params = _meta_content(meta_content_type.get("content") or "")
+        if params.get("charset"):
+            return params["charset"]
+
+    for item in soup.contents:
+        if isinstance(item, bs4.Doctype):
+            # Only the first doctype is considered; compare it
+            # case-insensitively so `<!doctype HTML>` matches too.
+            return "utf8" if item.strip().lower() == "html" else None
+    return None
+
 def request(url: str, method: str="GET", get_params: dict={},
         post_data: typing.Any=None, headers: dict={},
         json_data: typing.Any=None, code: bool=False, json: bool=False,
@@ -85,15 +107,25 @@ def request(url: str, method: str="GET", get_params: dict={},
                 allow_redirects=allow_redirects,
                 stream=True
             )
-            response_content = response.raw.read(RESPONSE_MAX, decode_content=True)
+            response_content = response.raw.read(RESPONSE_MAX,
+                decode_content=True)
+            if not response_content or not response.raw.read(1) == b"":
+                # response too large!
+                pass
         except utils.DeadlineExceededException:
             raise HTTPTimeoutException()
 
     response_headers = utils.CaseInsensitiveDict(dict(response.headers))
     content_type = response.headers.get("Content-Type", "").split(";", 1)[0]
 
+    souped = None
+    encoding = response.encoding
+    if content_type and content_type in SOUP_CONTENT_TYPES:
+        souped = bs4.BeautifulSoup(response_content, parser)
+        encoding = _find_encoding(souped) or encoding
+
     def _decode_data():
-        return response_content.decode(response.encoding or fallback_encoding)
+        return response_content.decode(encoding)
 
     if soup:
         if not check_content_type or content_type in SOUP_CONTENT_TYPES:
author	jesopo	2019-09-09 14:10:58 +0100
committer	jesopo	2019-09-09 14:11:18 +0100
commit	ff9c82bf671c31561231d0290f5ddaf0a98a088d (patch)
tree	1afd6fc241a52dc1d7e3d21fad843a1f49ab7c4a
parent	remove redundant (and maybe wrong) GET params from webfinger call (diff)
signature