From 330385aaf4ce3b5b4296b0db84cf58016cde2b75 Mon Sep 17 00:00:00 2001 From: klea Date: Sat, 6 Dec 2025 05:05:04 +0100 Subject: wikibot(wbmexclusions): modify regex to get url --- wikibot/wbmexclusions.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/wikibot/wbmexclusions.py b/wikibot/wbmexclusions.py index c062de2..47e25af 100644 --- a/wikibot/wbmexclusions.py +++ b/wikibot/wbmexclusions.py @@ -4,22 +4,22 @@ import re import os -extractionPattern = re.compile('[^:/]+://(?:www\\.)?([^/]+)') +extractionPattern = re.compile('[^:/]+://(?:www\\.)?(\\S+)') countMarkBegin = '' countMarkEnd = '' -def get_cleaned_domain(line): +def get_cleaned_url(line): # Extract the domain from the first URL appearing on a line, stripping away a leading "www." if any; returns None if no URL is found match = extractionPattern.search(line) if match: - return match.group(1) + return match.group(1).lower() return None def handle_page(site, page): # Extract domains from lines - entries = collections.deque((line, get_cleaned_domain(line)) for line in page.text().split('\n')) + entries = collections.deque((line, get_cleaned_url(line)) for line in page.text().split('\n')) # Identify blocks of URLs and sort them entries.append((None, None)) # Dummy entry at the end to trigger a last sorting if necessary @@ -27,8 +27,8 @@ def handle_page(site, page): currentBlock = [] urlCount = 0 while entries: - line, domain = entries.popleft() - if domain is None: + line, url = entries.popleft() + if url is None: # Either a line without a URL or the dummy entry at the end if currentBlock: currentBlock.sort(key = lambda x: x[1]) @@ -38,8 +38,8 @@ def handle_page(site, page): if line is not None: # Ignore the dummy entry output.append(line) elif line is not None: - # line and domain are not None, i.e. this is a line with a URL in it - currentBlock.append((line, domain)) + # line and url are not None, i.e. this is a line with a URL in it + currentBlock.append((line, url)) outputStr = '\n'.join(output) if countMarkBegin in outputStr and countMarkEnd in outputStr: -- cgit v1.3.1-10-gc9f91