diff options
| author | 2019-09-09 14:10:58 +0100 | |
|---|---|---|
| committer | 2019-09-09 14:11:18 +0100 | |
| commit | ff9c82bf671c31561231d0290f5ddaf0a98a088d (patch) | |
| tree | 1afd6fc241a52dc1d7e3d21fad843a1f49ab7c4a | |
| parent | remove redundant (and maybe wrong) GET params from webfinger call (diff) | |
| signature | ||
change utils.http.request to best-effort detect on-page encoding
closes #113
| -rw-r--r-- | src/utils/http.py | 36 |
1 file changed, 34 insertions, 2 deletions
```diff
diff --git a/src/utils/http.py b/src/utils/http.py
index 11a73fc4..2f529973 100644
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -58,6 +58,28 @@ class Response(object):
         self.data = data
         self.headers = headers
 
+def _meta_content(s: str) -> typing.Dict[str, str]:
+    out = {}
+    for keyvalue in str.split(";"):
+        key, _, value = keyvalue.strip().partition("=")
+        out[key] = value
+    return out
+
+def _find_encoding(soup: bs4.BeautifulSoup) -> typing.Optional[str]:
+    meta_charset = soup.meta.get("charset")
+    if not meta_charset == None:
+        return meta_charset
+    else:
+        meta_content_type = soup.findAll("meta",
+            {"http-equiv": lambda v: (v or "").lower() == "content-type"})
+        if meta_content_type:
+            return _meta_content(meta_content_type[0].get("content"))["charset"]
+        else:
+            doctype = [item for item in soup.contents if isinstance(item,
+                bs4.Doctype)] or None
+            if doctype and doctype[0] == "html":
+                return "utf8"
+
 def request(url: str, method: str="GET", get_params: dict={},
         post_data: typing.Any=None, headers: dict={},
         json_data: typing.Any=None, code: bool=False, json: bool=False,
@@ -85,15 +107,25 @@ def request(url: str, method: str="GET", get_params: dict={},
             allow_redirects=allow_redirects,
             stream=True
         )
-        response_content = response.raw.read(RESPONSE_MAX, decode_content=True)
+        response_content = response.raw.read(RESPONSE_MAX,
+            decode_content=True)
+        if not response_content or not response.raw.read(1) == b"":
+            # response too large!
+            pass
     except utils.DeadlineExceededException:
         raise HTTPTimeoutException()
 
     response_headers = utils.CaseInsensitiveDict(dict(response.headers))
     content_type = response.headers.get("Content-Type", "").split(";", 1)[0]
 
+    souped = None
+    encoding = response.encoding
+    if content_type and content_type in SOUP_CONTENT_TYPES:
+        souped = bs4.BeautifulSoup(response_content, parser)
+        encoding = _find_encoding(souped) or encoding
+
     def _decode_data():
-        return response_content.decode(response.encoding or fallback_encoding)
+        return response_content.decode(encoding)
 
     if soup:
         if not check_content_type or content_type in SOUP_CONTENT_TYPES:
```
