From 0fbd24068ed79eaf19de6800f7dcfcc5b77017c2 Mon Sep 17 00:00:00 2001
From: klea
Date: Tue, 9 Dec 2025 01:03:35 +0100
Subject: wikibot(discourse): make from wbmexclusions

---
 wikibot/discourse.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 wikibot/discourse.py

(limited to 'wikibot')

diff --git a/wikibot/discourse.py b/wikibot/discourse.py
new file mode 100644
index 0000000..80ca729
--- /dev/null
+++ b/wikibot/discourse.py
@@ -0,0 +1,60 @@
+import collections
+import pywikibot
+import re
+
+# The subdomain alternation was derived from the existing wiki entries with:
+#   sed -e 's/^* \[//' -e 's/^* //' -e 's/^http//' -e 's/^s//' \
+#       -e 's*^://**' -e 's/\..*//' wikientries_joined.txt |\
+#   sort | uniq -c | sort -hr
+# and then manually curated. Every '(' is rewritten to '(?:' below so that
+# the trailing (\S+) is the only capturing group of extractionPattern;
+# get_cleaned_url relies on that when it reads group(1).
+subdomainPattern = '((ask|bbs|chat|community|comunidad|disc(ourse|uss(ions?|))|for(o|ums?)|help|support|www)\\.)'.replace('(','(?:')
+extractionPattern = re.compile('(?:^\\* \\[?ht)[^:/]+://'+ subdomainPattern +'?(\\S+)')
+
+def get_cleaned_url(line):
+    # Sort key for a '* http...' or '* [http...' bullet line: everything up to the
+    # next whitespace after the scheme and one optional curated subdomain ('www.',
+    # 'forum.', ...), lower-cased. Returns None if the line is not such a bullet.
+    found = extractionPattern.search(line)
+    return found.group(1).lower() if found else None
+
+
+def handle_page(site, page):
+    # Sort every run of consecutive URL bullet lines in page.text by the key from
+    # get_cleaned_url, leave all other lines in place, and save only on change.
+    entries = collections.deque((line, get_cleaned_url(line)) for line in page.text.split('\n'))
+
+    # Identify blocks of URLs and sort them
+    entries.append((None, None)) # Dummy entry at the end to trigger a last sorting if necessary
+    output = []
+    currentBlock = []
+    while entries:
+        line, url = entries.popleft()
+        if url is not None:
+            # A line with a URL in it: collect it until the block ends
+            currentBlock.append((line, url))
+            continue
+        # Either a line without a URL or the dummy entry at the end
+        if currentBlock:
+            # list.sort is stable, so lines with equal keys keep their order
+            currentBlock.sort(key = lambda x: x[1])
+            output.extend(x[0] for x in currentBlock)
+            currentBlock = []
+        if line is not None: # Ignore the dummy entry
+            output.append(line)
+
+    outputStr = '\n'.join(output)
+
+    # Update if necessary
+    if page.text != outputStr:
+        site.login()
+        page.text = outputStr
+        page.save("Reordered websites")
+
+def main(): # Entry point: reorder the URL lists on all 'Discourse*' pages
+    wiki = pywikibot.Site('en', 'ArchiveTeam')
+    for discoursePage in wiki.allpages(prefix = 'Discourse'): # title prefix match
+        handle_page(wiki, discoursePage)
+
+main() # NOTE(review): no __main__ guard, so importing this module also runs the bot
-- 
cgit v1.3.1-10-gc9f91