wikibot(wbmexclusions): handle the exclusion list page and its subpages

author: klea 2025-12-06 02:57:24 +0100
committer: klea 2025-12-06 02:57:24 +0100
commit: dca4301c8aa7a6397a28dd82143f06bd10d23e4a (patch)
tree: ff61fa71fe0669194d426ae5076b71e2b62f7371 /wikibot
parent: wb(wbmexclusions): fix python SyntaxWarning for regex (diff)
1 files changed, 43 insertions, 41 deletions
diff --git a/wikibot/wbmexclusions.py b/wikibot/wbmexclusions.py
index a53ef73..c062de2 100644
--- a/wikibot/wbmexclusions.py
+++ b/wikibot/wbmexclusions.py
@@ -10,52 +10,54 @@ countMarkEnd = '<!-- /atwikibot:urlCount -->'
 
 
 def get_cleaned_domain(line):
-	# Extract the domain from the first URL appearing on a line, stripping away a leading "www." if any; returns None if no URL is found
-	match = extractionPattern.search(line)
-	if match:
-		return match.group(1)
-	return None
+    # Extract the domain from the first URL appearing on a line, stripping away a leading "www." if any; returns None if no URL is found
+    match = extractionPattern.search(line)
+    if match:
+        return match.group(1)
+    return None
 
 
-def main():
-	site = mwclient.Site('wiki.archiveteam.org', path = '/')
-
-	page = site.Pages['List of websites excluded from the Wayback Machine']
+def handle_page(site, page):
+    # Extract domains from lines
+    entries = collections.deque((line, get_cleaned_domain(line)) for line in page.text().split('\n'))
 
-	# Extract domains from lines
-	entries = collections.deque((line, get_cleaned_domain(line)) for line in page.text().split('\n'))
+    # Identify blocks of URLs and sort them
+    entries.append((None, None)) # Dummy entry at the end to trigger a last sorting if necessary
+    output = []
+    currentBlock = []
+    urlCount = 0
+    while entries:
+        line, domain = entries.popleft()
+        if domain is None:
+            # Either a line without a URL or the dummy entry at the end
+            if currentBlock:
+                currentBlock.sort(key = lambda x: x[1])
+                output.extend(x[0] for x in currentBlock)
+                urlCount += len(currentBlock)
+                currentBlock = []
+            if line is not None: # Ignore the dummy entry
+                output.append(line)
+        elif line is not None:
+            # line and domain are not None, i.e. this is a line with a URL in it
+            currentBlock.append((line, domain))
 
-	# Identify blocks of URLs and sort them
-	entries.append((None, None)) # Dummy entry at the end to trigger a last sorting if necessary
-	output = []
-	currentBlock = []
-	urlCount = 0
-	while entries:
-		line, domain = entries.popleft()
-		if domain is None:
-			# Either a line without a URL or the dummy entry at the end
-			if currentBlock:
-				currentBlock.sort(key = lambda x: x[1])
-				output.extend(x[0] for x in currentBlock)
-				urlCount += len(currentBlock)
-				currentBlock = []
-			if line is not None: # Ignore the dummy entry
-				output.append(line)
-		elif line is not None:
-			# line and domain are not None, i.e. this is a line with a URL in it
-			currentBlock.append((line, domain))
+    outputStr = '\n'.join(output)
+    if countMarkBegin in outputStr and countMarkEnd in outputStr:
+        countMarkBeginPos = outputStr.index(countMarkBegin)
+        countMarkEndPos = outputStr.find(countMarkEnd, countMarkBeginPos) # End mark could be before begin mark
+        if countMarkEndPos != -1:
+            outputStr = outputStr[:countMarkBeginPos] + countMarkBegin + 'This list currently contains ' + str(urlCount) + ' URL' + ('s' if urlCount != 1 else '') + '.' + countMarkEnd + outputStr[countMarkEndPos + len(countMarkEnd):]
 
-	outputStr = '\n'.join(output)
-	if countMarkBegin in outputStr and countMarkEnd in outputStr:
-		countMarkBeginPos = outputStr.index(countMarkBegin)
-		countMarkEndPos = outputStr.find(countMarkEnd, countMarkBeginPos) # End mark could be before begin mark
-		if countMarkEndPos != -1:
-			outputStr = outputStr[:countMarkBeginPos] + countMarkBegin + 'This list currently contains ' + str(urlCount) + ' URL' + ('s' if urlCount != 1 else '') + '.' + countMarkEnd + outputStr[countMarkEndPos + len(countMarkEnd):]
-
-	# Update if necessary
-	if page.text() != outputStr:
-		site.login(os.environ['ATWIKIBOT_USERNAME'], os.environ['ATWIKIBOT_PASSWORD']) # Only log in when necessary
-		page.save(outputStr)
+    # Update if necessary
+    if page.text() != outputStr:
+        if not site.logged_in: # Only log in when necessary
+            site.login(os.environ['ATWIKIBOT_USERNAME'], os.environ['ATWIKIBOT_PASSWORD'])
+        page.save(outputStr)
 
+def main():
+    site = mwclient.Site('wiki.archiveteam.org', path = '/')
+    for page in site.Pages:
+        if not page.name.startswith('List of websites excluded from the Wayback Machine'): continue
+        handle_page(site, page)
 
 main()
author	klea	2025-12-06 02:57:24 +0100
committer	klea	2025-12-06 02:57:24 +0100
commit	dca4301c8aa7a6397a28dd82143f06bd10d23e4a (patch)
tree	ff61fa71fe0669194d426ae5076b71e2b62f7371 /wikibot
parent	wb(wbmexclusions): fix python SyntaxWarning for regex (diff)
signature