# Provenance (recovered from the git patch this file was extracted from):
#   commit  0fbd24068ed79eaf19de6800f7dcfcc5b77017c2
#   From:   klea
#   Date:   Tue, 9 Dec 2025 01:03:35 +0100
#   Subject: wikibot(discourse): make from wbmexclusions
#   Adds:   wikibot/discourse.py (new file)
"""Sort the bulleted URL lists on the ArchiveTeam wiki "Discourse*" pages.

Every run of consecutive URL bullet lines is sorted by the URL with its
scheme and a well-known leading subdomain removed; all other lines keep
their position.
"""
import itertools
import operator
import re

# made from: sed \
#     -e 's/^* \[//' -e 's/^* //' \
#     -e 's/^http//' -e 's/^s//' \
#     -e 's*^://**' -e 's/\..*//' \
#     wikientries_joined.txt |\
#     sort | uniq -c | sort -hr
# and then manually curated the subdomain part of the regex
#
# Every group is non-capturing so that the only capturing group in
# extractionPattern is the sort key.
subdomainPattern = (
    r'(?:(?:ask|bbs|chat|community|comunidad|disc(?:ourse|uss(?:ions?|))'
    r'|for(?:o|ums?)|help|support|www)\.)'
)

# Matches a line that starts with "* " (optionally followed by "[") and an
# "ht...://" scheme.  Group 1 is everything after the optional subdomain up
# to the first whitespace character.
extractionPattern = re.compile(
    r'(?:^\* \[?ht)[^:/]+://' + subdomainPattern + r'?(\S+)'
)


def get_cleaned_url(line):
    """Return the sort key of a URL bullet line, or None for any other line.

    A URL bullet line starts with "* http(s)://" or "* [http(s)://".  The key
    is the rest of the URL up to the first whitespace (so it may include a
    path), with one leading subdomain from subdomainPattern (e.g. "www.",
    "forum.", "community.") removed, converted to lower case.  URLs that do
    not appear right after the bullet at the start of the line are ignored.
    """
    match = extractionPattern.search(line)
    if match is None:
        return None
    return match.group(1).lower()


def sort_url_blocks(text):
    """Return text with every run of consecutive URL bullet lines sorted.

    Lines are split on "\\n".  Lines for which get_cleaned_url() returns None
    stay where they are and act as separators between runs.  Within a run the
    sort is stable, so lines with equal keys keep their relative order.
    """
    keyed_lines = (
        (get_cleaned_url(line), line) for line in text.split('\n')
    )
    output = []
    # groupby yields maximal runs of URL lines / non-URL lines alternately,
    # which also covers a URL run at the very end of the text.
    for is_url_run, run in itertools.groupby(
        keyed_lines, key=lambda entry: entry[0] is not None
    ):
        if is_url_run:
            run = sorted(run, key=operator.itemgetter(0))
        output.extend(line for _, line in run)
    return '\n'.join(output)


def handle_page(site, page):
    """Reorder the URL lists of page and save it if anything changed.

    site -- object providing login(); called only when an edit is needed.
    page -- object providing a readable/writable text attribute and
            save(summary).
    """
    original = page.text
    reordered = sort_url_blocks(original)

    # Update if necessary
    if original != reordered:
        site.login()
        page.text = reordered
        page.save("Reordered websites")


def main():
    """Process every page of the ArchiveTeam wiki whose title starts with "Discourse"."""
    # Imported here because only this function needs the bot framework; the
    # text helpers above stay importable without it.
    import pywikibot

    site = pywikibot.Site('en', 'ArchiveTeam')
    for page in site.allpages(prefix='Discourse'):
        handle_page(site, page)


if __name__ == '__main__':
    main()