diff options
| author | 2019-07-02 14:10:18 +0100 | |
|---|---|---|
| committer | 2019-07-02 14:10:18 +0100 | |
| commit | 534854127be47f8892c3f3952d779d31014452df (patch) | |
| tree | 635d6c3b6fbc7593da11336985e1a948188e0fe8 /src/utils/http.py | |
| parent | mulitline-concat shouldn't be a c2c tag (diff) | |
| signature | ||
Add utils.http.url_validate() for best-effort url tidying
Diffstat (limited to 'src/utils/http.py')
| -rw-r--r-- | src/utils/http.py | 16 |
1 file changed, 16 insertions, 0 deletions
```diff
diff --git a/src/utils/http.py b/src/utils/http.py
index 88555568..e65e1e23 100644
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -6,6 +6,22 @@ from src import utils
 
 REGEX_URL = re.compile("https?://[A-Z0-9{}]+".format(re.escape("-._~:/%?#[]@!$&'()*+,;=")), re.I)
 
+# best-effort tidying up of URLs
+def url_validate(url: str):
+    if url.endswith(")"):
+        # trim ")" from the end only if there's not a "(" to match it
+        # google.com/) -> google.com/
+        # google.com/() -> google.com/()
+        # google.com/()) -> google.com/()
+
+        if "(" in url:
+            open_index = url.rfind("(")
+            other_index = url.rfind(")", 0, len(url)-1)
+            if other_index == -1 or other_index < open_index:
+                return url
+        return url[:-1]
+    return url
+
 USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
               "(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36")
 
```
