src/utils/http.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105

import re, traceback, urllib.error, urllib.parse, urllib.request
import json, ssl
import bs4

# Browser-like User-Agent header value sent with every request made by get_url().
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36")
# Matches an "http://" or "https://" prefix, case-insensitively.
REGEX_HTTP = re.compile("https?://", re.I)


def _sniff_encoding(raw, parser):
    """Guess the charset of a body that arrived without a declared charset.

    Parses *raw* with BeautifulSoup using *parser* and looks, in order, at:
    the first ``<meta>`` tag carrying ``charset=`` inside its ``content``
    attribute or a ``charset`` attribute, then the first doctype node
    (``html`` doctype -> ``"utf8"``, any other doctype -> ``"latin-1"``).
    Returns ``None`` (or an empty string) when nothing usable is found.
    """
    soup = bs4.BeautifulSoup(raw, parser)
    encoding = None
    for meta in soup.find_all("meta"):
        content = meta.get("content", "")
        if "charset=" in content:
            # e.g. content="text/html; charset=utf-8; foo" -> "utf-8"
            encoding = content.split("charset=", 1)[1].split(";", 1)[0].strip()
            break
        if meta.get("charset", ""):
            encoding = meta.get("charset")
            break
    if encoding:
        return encoding
    for item in soup.contents:
        if isinstance(item, bs4.Doctype):
            return "utf8" if item == "html" else "latin-1"
    return encoding


def get_url(url, **kwargs):
    """Fetch *url* and return its body as text, JSON or a BeautifulSoup tree.

    A URL without a scheme is prefixed with ``http://``.

    Keyword arguments:
        method: HTTP method to send (default ``"GET"``; it is not switched
            automatically when a request body is supplied).
        get_params: mapping/sequence url-encoded and appended as the query
            string (joined with ``&`` if the URL already has one).
        post_params: mapping/sequence url-encoded and sent as the body;
            takes precedence over ``post_data``.
        post_data: raw request body, ``str`` (encoded as UTF-8) or ``bytes``.
        headers: mapping of extra request headers.
        code: when truthy, return ``(status_code, result)`` instead of just
            ``result``.
        soup: when truthy, return the body parsed by BeautifulSoup.
        parser: BeautifulSoup parser name (default ``"lxml"``).
        json: when truthy, parse a non-empty body as JSON.
        timeout: seconds passed to ``urlopen`` (default ``5``).

    Returns:
        The decoded body (``str``), the parsed JSON value, or a soup object.
        On failure the result is ``False``; with ``code`` set the status is
        ``0`` for a URL/body that cannot be encoded, the HTTP status for an
        ``HTTPError``, ``-1`` for other URL/certificate errors, and the
        response status when the body is not valid JSON.
    """
    if not urllib.parse.urlparse(url).scheme:
        url = "http://%s" % url

    method = kwargs.get("method", "GET")
    get_params = kwargs.get("get_params", "")
    post_params = kwargs.get("post_params", None)
    post_data = kwargs.get("post_data", None)
    headers = kwargs.get("headers") or {}
    return_code = kwargs.get("code", False)
    timeout = kwargs.get("timeout", 5)
    parser = kwargs.get("parser", "lxml")

    def result(code, payload):
        # Single place that decides between "payload" and "(code, payload)"
        # so every exit path honours the ``code`` flag the same way.
        if return_code:
            return code, payload
        return payload

    if get_params:
        query = urllib.parse.urlencode(get_params)
        if "?" not in url:
            url = "%s?%s" % (url, query)
        elif url.endswith(("?", "&")):
            url = url + query
        else:
            # The URL already carries a query string: extend it rather than
            # starting a second one with another "?".
            url = "%s&%s" % (url, query)
    if post_params:
        post_data = urllib.parse.urlencode(post_params)

    try:
        # urllib can only put latin-1 encodable URLs on the wire.
        url.encode("latin-1")
        if isinstance(post_data, str):
            post_data = post_data.encode("utf8")
    except UnicodeEncodeError:
        return result(0, False)

    request = urllib.request.Request(url, post_data)
    request.add_header("Accept-Language", "en-US")
    request.add_header("User-Agent", USER_AGENT)
    for header, value in headers.items():
        request.add_header(header, value)
    request.method = method

    try:
        response = urllib.request.urlopen(request, timeout=timeout)
    except urllib.error.HTTPError as e:
        # Must come first: HTTPError is a subclass of URLError.
        traceback.print_exc()
        return result(e.code, False)
    except (urllib.error.URLError, ssl.CertificateError):
        traceback.print_exc()
        return result(-1, False)

    # Read everything we need, then release the connection.
    with response:
        status = response.code
        response_content = response.read()
        encoding = response.info().get_content_charset()

    if kwargs.get("soup"):
        return result(status, bs4.BeautifulSoup(response_content, parser))

    if not encoding:
        encoding = _sniff_encoding(response_content, parser)

    try:
        data = response_content.decode(encoding or "utf8")
    except LookupError:
        # The server/page named a charset Python does not know.
        data = response_content.decode("utf8", errors="replace")
    except UnicodeDecodeError:
        # The declared charset does not match the bytes; keep what we can.
        data = response_content.decode(encoding or "utf8", errors="replace")

    if kwargs.get("json") and data:
        try:
            data = json.loads(data)
        except json.decoder.JSONDecodeError:
            traceback.print_exc()
            return result(status, False)
    return result(status, data)

def strip_html(s):
    """Parse *s* with BeautifulSoup's lxml parser and return its text content."""
    document = bs4.BeautifulSoup(s, "lxml")
    return document.get_text()