diff options
| author | 2020-02-13 21:50:33 +0000 | |
|---|---|---|
| committer | 2020-02-13 21:50:33 +0000 | |
| commit | df38d7a57f6b809de4808ed601bfc747aef1c79c (patch) | |
| tree | 714b88cf856f645a0a32034dd1c86fddecd338cd /src/utils | |
| parent | labeled response raw.received should fire BEFORE line_handler's (diff) | |
| signature | ||
replace lxml usage with html5lib! the future is cool
Diffstat (limited to 'src/utils')
| -rw-r--r-- | src/utils/http.py | 6 |
1 file changed, 3 insertions, 3 deletions
```diff
diff --git a/src/utils/http.py b/src/utils/http.py
index 239ae11a..9f25b315 100644
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -127,7 +127,7 @@ class Response(object):
         return self.data.decode(encoding or self.encoding)
     def json(self) -> typing.Any:
         return _json.loads(self.data)
-    def soup(self, parser: str="lxml") -> bs4.BeautifulSoup:
+    def soup(self, parser: str="html5lib") -> bs4.BeautifulSoup:
         return bs4.BeautifulSoup(self.decode(), parser)
 
 def _split_content(s: str) -> typing.Dict[str, str]:
@@ -144,7 +144,7 @@ def _find_encoding(headers: typing.Dict[str, str], data: bytes
     if "charset" in content_header:
         return content_header["charset"]
 
-    soup = bs4.BeautifulSoup(data, "lxml")
+    soup = bs4.BeautifulSoup(data, "html5lib")
     if not soup.meta == None:
         meta_charset = soup.meta.get("charset")
         if not meta_charset == None:
@@ -275,7 +275,7 @@ class Client(object):
     request_many = request_many
 
 def strip_html(s: str) -> str:
-    return bs4.BeautifulSoup(s, "lxml").get_text()
+    return bs4.BeautifulSoup(s, "html5lib").get_text()
 
 def resolve_hostname(hostname: str) -> typing.List[str]:
     try:
```
