summary | refs | log | tree | commit | diff
path: root/wikibot
diff options
context:
space:
mode:
author: klea, 2025-12-06 02:57:24 +0100
committer: klea, 2025-12-06 02:57:24 +0100
commit: dca4301c8aa7a6397a28dd82143f06bd10d23e4a (patch)
tree: ff61fa71fe0669194d426ae5076b71e2b62f7371 /wikibot
parent: wb(wbmexclusions): fix python SyntaxWarning for regex (diff)
signature
wikibot(wbmexclusions): handle subpages of it
Diffstat (limited to 'wikibot')
-rw-r--r-- wikibot/wbmexclusions.py 84
1 files changed, 43 insertions, 41 deletions
diff --git a/wikibot/wbmexclusions.py b/wikibot/wbmexclusions.py
index a53ef73..c062de2 100644
--- a/wikibot/wbmexclusions.py
+++ b/wikibot/wbmexclusions.py
@@ -10,52 +10,54 @@ countMarkEnd = '<!-- /atwikibot:urlCount -->'
def get_cleaned_domain(line):
- # Extract the domain from the first URL appearing on a line, stripping away a leading "www." if any; returns None if no URL is found
- match = extractionPattern.search(line)
- if match:
- return match.group(1)
- return None
+ # Extract the domain from the first URL appearing on a line, stripping away a leading "www." if any; returns None if no URL is found
+ match = extractionPattern.search(line)
+ if match:
+ return match.group(1)
+ return None
-def main():
- site = mwclient.Site('wiki.archiveteam.org', path = '/')
-
- page = site.Pages['List of websites excluded from the Wayback Machine']
+def handle_page(site, page):
+ # Extract domains from lines
+ entries = collections.deque((line, get_cleaned_domain(line)) for line in page.text().split('\n'))
- # Extract domains from lines
- entries = collections.deque((line, get_cleaned_domain(line)) for line in page.text().split('\n'))
+ # Identify blocks of URLs and sort them
+ entries.append((None, None)) # Dummy entry at the end to trigger a last sorting if necessary
+ output = []
+ currentBlock = []
+ urlCount = 0
+ while entries:
+ line, domain = entries.popleft()
+ if domain is None:
+ # Either a line without a URL or the dummy entry at the end
+ if currentBlock:
+ currentBlock.sort(key = lambda x: x[1])
+ output.extend(x[0] for x in currentBlock)
+ urlCount += len(currentBlock)
+ currentBlock = []
+ if line is not None: # Ignore the dummy entry
+ output.append(line)
+ elif line is not None:
+ # line and domain are not None, i.e. this is a line with a URL in it
+ currentBlock.append((line, domain))
- # Identify blocks of URLs and sort them
- entries.append((None, None)) # Dummy entry at the end to trigger a last sorting if necessary
- output = []
- currentBlock = []
- urlCount = 0
- while entries:
- line, domain = entries.popleft()
- if domain is None:
- # Either a line without a URL or the dummy entry at the end
- if currentBlock:
- currentBlock.sort(key = lambda x: x[1])
- output.extend(x[0] for x in currentBlock)
- urlCount += len(currentBlock)
- currentBlock = []
- if line is not None: # Ignore the dummy entry
- output.append(line)
- elif line is not None:
- # line and domain are not None, i.e. this is a line with a URL in it
- currentBlock.append((line, domain))
+ outputStr = '\n'.join(output)
+ if countMarkBegin in outputStr and countMarkEnd in outputStr:
+ countMarkBeginPos = outputStr.index(countMarkBegin)
+ countMarkEndPos = outputStr.find(countMarkEnd, countMarkBeginPos) # End mark could be before begin mark
+ if countMarkEndPos != -1:
+ outputStr = outputStr[:countMarkBeginPos] + countMarkBegin + 'This list currently contains ' + str(urlCount) + ' URL' + ('s' if urlCount != 1 else '') + '.' + countMarkEnd + outputStr[countMarkEndPos + len(countMarkEnd):]
- outputStr = '\n'.join(output)
- if countMarkBegin in outputStr and countMarkEnd in outputStr:
- countMarkBeginPos = outputStr.index(countMarkBegin)
- countMarkEndPos = outputStr.find(countMarkEnd, countMarkBeginPos) # End mark could be before begin mark
- if countMarkEndPos != -1:
- outputStr = outputStr[:countMarkBeginPos] + countMarkBegin + 'This list currently contains ' + str(urlCount) + ' URL' + ('s' if urlCount != 1 else '') + '.' + countMarkEnd + outputStr[countMarkEndPos + len(countMarkEnd):]
-
- # Update if necessary
- if page.text() != outputStr:
- site.login(os.environ['ATWIKIBOT_USERNAME'], os.environ['ATWIKIBOT_PASSWORD']) # Only log in when necessary
- page.save(outputStr)
+ # Update if necessary
+ if page.text() != outputStr:
+ if not site.logged_in: # Only log in when necessary
+ site.login(os.environ['ATWIKIBOT_USERNAME'], os.environ['ATWIKIBOT_PASSWORD'])
+ page.save(outputStr)
+def main():
+ site = mwclient.Site('wiki.archiveteam.org', path = '/')
+ for page in site.Pages:
+ if not page.name.startswith('List of websites excluded from the Wayback Machine'): continue
+ handle_page(site, page)
main()