From df38d7a57f6b809de4808ed601bfc747aef1c79c Mon Sep 17 00:00:00 2001
From: jesopo
Date: Thu, 13 Feb 2020 21:50:33 +0000
Subject: replace lxml usage with html5lib! the future is cool

---
 src/utils/http.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/utils/http.py')

diff --git a/src/utils/http.py b/src/utils/http.py
index 239ae11a..9f25b315 100644
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -127,7 +127,7 @@ class Response(object):
         return self.data.decode(encoding or self.encoding)
     def json(self) -> typing.Any:
         return _json.loads(self.data)
-    def soup(self, parser: str="lxml") -> bs4.BeautifulSoup:
+    def soup(self, parser: str="html5lib") -> bs4.BeautifulSoup:
         return bs4.BeautifulSoup(self.decode(), parser)
 
 def _split_content(s: str) -> typing.Dict[str, str]:
@@ -144,7 +144,7 @@ def _find_encoding(headers: typing.Dict[str, str], data: bytes
         if "charset" in content_header:
             return content_header["charset"]
 
-    soup = bs4.BeautifulSoup(data, "lxml")
+    soup = bs4.BeautifulSoup(data, "html5lib")
     if not soup.meta == None:
         meta_charset = soup.meta.get("charset")
         if not meta_charset == None:
@@ -275,7 +275,7 @@ class Client(object):
     request_many = request_many
 
 def strip_html(s: str) -> str:
-    return bs4.BeautifulSoup(s, "lxml").get_text()
+    return bs4.BeautifulSoup(s, "html5lib").get_text()
 
 def resolve_hostname(hostname: str) -> typing.List[str]:
     try:
-- 
cgit v1.3.1-10-gc9f91