From 408b89aeb71212668e9227e2d23d9439b80591c5 Mon Sep 17 00:00:00 2001
From: jesopo
Date: Mon, 2 Sep 2019 13:25:48 +0100
Subject: use \S+ for url regex (for non-ascii chars), use url_sanitise to
 catch <>

---
 src/utils/http.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'src/utils')

diff --git a/src/utils/http.py b/src/utils/http.py
index c2ecf35d..e4ec5fe9 100644
--- a/src/utils/http.py
+++ b/src/utils/http.py
@@ -5,25 +5,28 @@ import bs4, netifaces, requests
 import tornado.httpclient
 from src import utils
 
-REGEX_URL = re.compile("https?://[A-Z0-9{}]+".format(re.escape("-._~:/%?#[]@!$&'()*+,;=")), re.I)
+REGEX_URL = re.compile(r"https?://\S+", re.I)  # raw string: "\S" is an invalid escape in a plain literal
+
+PAIRED_CHARACTERS = ["<>", "()"]  # (opener, closer) pairs that url_sanitise balances at the end of a URL
 
 # best-effort tidying up of URLs
 def url_sanitise(url: str):
     if not urllib.parse.urlparse(url).scheme:
         url = "http://%s" % url
 
-    if url.endswith(")"):
+    for pair_start, pair_end in PAIRED_CHARACTERS:
         # trim ")" from the end only if there's not a "(" to match it
         # google.com/) -> google.com/
         # google.com/() -> google.com/()
         # google.com/()) -> google.com/()
-
-        if "(" in url:
-            open_index = url.rfind("(")
-            other_index = url.rfind(")", 0, len(url)-1)
-            if other_index == -1 or other_index < open_index:
-                return url
-        return url[:-1]
+        if url.endswith(pair_end):
+            if pair_start in url:
+                open_index = url.rfind(pair_start)  # last opener of *this* pair, not always "("
+                other_index = url.rfind(pair_end, 0, len(url)-1)  # last closer before the final char
+                if other_index > open_index:  # last opener is already closed, so the final closer is a stray
+                    url = url[:-1]
+            else:
+                url = url[:-1]
     return url
 
 USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
-- 
cgit v1.3.1-10-gc9f91