From ed775ddbe3d66d1218ce034de97ad4ea641002bf Mon Sep 17 00:00:00 2001 From: jesopo Date: Tue, 26 Nov 2019 11:35:56 +0000 Subject: remove `parser` from utils.http.Request, add Request.soup() --- src/utils/http.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/utils/http.py b/src/utils/http.py index 9e9bf3e6..80bf5eae 100644 --- a/src/utils/http.py +++ b/src/utils/http.py @@ -72,9 +72,7 @@ class Request(object): allow_redirects: bool = True check_content_type: bool = True - parse: bool = False detect_encoding: bool = True - parser: str = "lxml" fallback_encoding: typing.Optional[str] = None content_type: typing.Optional[str] = None proxy: typing.Optional[str] = None @@ -126,8 +124,12 @@ class Response(object): self.encoding = encoding self.headers = headers self.cookies = cookies - def json(self): + def decode(self) -> str: + return self.data + def json(self) -> typing.Any: return _json.loads(self.data) + def soup(self, parser: str="lxml") -> bs4.BeautifulSoup: + return bs4.BeautifulSoup(self.decode(), parser) def _meta_content(s: str) -> typing.Dict[str, str]: out = {} @@ -200,23 +202,12 @@ def _request(request_obj: Request) -> Response: if (request_obj.detect_encoding and response.content_type and response.content_type in SOUP_CONTENT_TYPES): - souped = bs4.BeautifulSoup(response.data, request_obj.parser) + souped = bs4.BeautifulSoup(response.data, "lxml") encoding = _find_encoding(souped) or encoding def _decode_data(): return response.data.decode(encoding) - if request_obj.parse: - if (not request_obj.check_content_type or - response.content_type in SOUP_CONTENT_TYPES): - souped = bs4.BeautifulSoup(_decode_data(), request_obj.parser) - response.data = souped - return response - else: - raise HTTPWrongContentTypeException( - "Tried to soup non-html/non-xml data (%s)" % - response.content_type) - if request_obj.json and response.data: data = _decode_data() try: -- cgit v1.3.1-10-gc9f91