From 69d58eede2e9bf83aa1ed1d8fcf956efde494726 Mon Sep 17 00:00:00 2001 From: jesopo Date: Wed, 3 Oct 2018 13:22:37 +0100 Subject: Move src/Utils.py in to src/utils/, splitting functionality out in to modules of related functionality --- src/utils/http.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 src/utils/http.py (limited to 'src/utils/http.py') diff --git a/src/utils/http.py b/src/utils/http.py new file mode 100644 index 00000000..e9cb41f9 --- /dev/null +++ b/src/utils/http.py @@ -0,0 +1,94 @@ +import re, traceback, urllib.error, urllib.parse, urllib.request +import json, ssl +import bs4 + +USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36") +REGEX_HTTP = re.compile("https?://", re.I) + +def get_url(url, **kwargs): + if not urllib.parse.urlparse(url).scheme: + url = "http://%s" % url + url_parsed = urllib.parse.urlparse(url) + + method = kwargs.get("method", "GET") + get_params = kwargs.get("get_params", "") + post_params = kwargs.get("post_params", None) + headers = kwargs.get("headers", {}) + if get_params: + get_params = "?%s" % urllib.parse.urlencode(get_params) + if post_params: + post_params = urllib.parse.urlencode(post_params).encode("utf8") + url = "%s%s" % (url, get_params) + try: + url.encode("latin-1") + except UnicodeEncodeError: + if kwargs.get("code"): + return 0, False + return False + + request = urllib.request.Request(url, post_params) + request.add_header("Accept-Language", "en-US") + request.add_header("User-Agent", USER_AGENT) + for header, value in headers.items(): + request.add_header(header, value) + request.method = method + + try: + response = urllib.request.urlopen(request, timeout=5) + except urllib.error.HTTPError as e: + traceback.print_exc() + if kwargs.get("code"): + return e.code, False + return False + except urllib.error.URLError as e: + traceback.print_exc() + if kwargs.get("code"): + return -1, False + return False + except ssl.CertificateError as e: + traceback.print_exc() + if kwargs.get("code"): + return -1, False, + return False + + response_content = response.read() + encoding = response.info().get_content_charset() + if kwargs.get("soup"): + return bs4.BeautifulSoup(response_content, kwargs.get("parser", "lxml")) + if not encoding: + soup = bs4.BeautifulSoup(response_content, kwargs.get("parser", "lxml")) + metas = soup.find_all("meta") + for meta in metas: + if "charset=" in meta.get("content", ""): + encoding = meta.get("content").split("charset=", 1)[1 + ].split(";", 1)[0] + elif meta.get("charset", ""): + encoding = meta.get("charset") + else: + continue + break + if not encoding: + for item in soup.contents: + if isinstance(item, bs4.Doctype): + if item == "html": + encoding = "utf8" + else: + encoding = "latin-1" + break + response_content = response_content.decode(encoding or "utf8") + data = response_content + if kwargs.get("json") and data: + try: + data = json.loads(response_content) + except json.decoder.JSONDecodeError: + traceback.print_exc() + return False + if kwargs.get("code"): + return response.code, data + else: + return data + +def strip_html(s): + return bs4.BeautifulSoup(s, "lxml").get_text() + -- cgit v1.3.1-10-gc9f91