wikibot(wbmexclusions): modify regex to get url

author: klea 2025-12-06 05:05:04 +0100
committer: klea 2025-12-06 05:06:28 +0100
commit: 330385aaf4ce3b5b4296b0db84cf58016cde2b75 (patch)
tree: 5e15b8939dcc97817780db978f60b56073c33fbd
parent: wikibot(wbmexclusions): handle subpages of it (diff)
1 files changed, 8 insertions, 8 deletions
diff --git a/wikibot/wbmexclusions.py b/wikibot/wbmexclusions.py
index c062de2..47e25af 100644
--- a/wikibot/wbmexclusions.py
+++ b/wikibot/wbmexclusions.py
@@ -4,22 +4,22 @@ import re
 import os
 
 
-extractionPattern = re.compile('[^:/]+://(?:www\\.)?([^/]+)')
+extractionPattern = re.compile('[^:/]+://(?:www\\.)?(\\S+)')
 countMarkBegin = '<!-- atwikibot:urlCount -->'
 countMarkEnd = '<!-- /atwikibot:urlCount -->'
 
 
-def get_cleaned_domain(line):
+def get_cleaned_url(line):
     # Extract the domain from the first URL appearing on a line, stripping away a leading "www." if any; returns None if no URL is found
     match = extractionPattern.search(line)
     if match:
-        return match.group(1)
+        return match.group(1).lower()
     return None
 
 
 def handle_page(site, page):
     # Extract domains from lines
-    entries = collections.deque((line, get_cleaned_domain(line)) for line in page.text().split('\n'))
+    entries = collections.deque((line, get_cleaned_url(line)) for line in page.text().split('\n'))
 
     # Identify blocks of URLs and sort them
     entries.append((None, None)) # Dummy entry at the end to trigger a last sorting if necessary
@@ -27,8 +27,8 @@ def handle_page(site, page):
     currentBlock = []
     urlCount = 0
     while entries:
-        line, domain = entries.popleft()
-        if domain is None:
+        line, url = entries.popleft()
+        if url is None:
             # Either a line without a URL or the dummy entry at the end
             if currentBlock:
                 currentBlock.sort(key = lambda x: x[1])
@@ -38,8 +38,8 @@ def handle_page(site, page):
             if line is not None: # Ignore the dummy entry
                 output.append(line)
         elif line is not None:
-            # line and domain are not None, i.e. this is a line with a URL in it
-            currentBlock.append((line, domain))
+            # line and url are not None, i.e. this is a line with a URL in it
+            currentBlock.append((line, url))
 
     outputStr = '\n'.join(output)
     if countMarkBegin in outputStr and countMarkEnd in outputStr:
author	klea	2025-12-06 05:05:04 +0100
committer	klea	2025-12-06 05:06:28 +0100
commit	330385aaf4ce3b5b4296b0db84cf58016cde2b75 (patch)
tree	5e15b8939dcc97817780db978f60b56073c33fbd
parent	wikibot(wbmexclusions): handle subpages of it (diff)