From e633ee84b9e33f12072a887744c3ef1d090ff4f3 Mon Sep 17 00:00:00 2001
From: klea
Date: Sun, 11 Jan 2026 18:59:19 +0000
Subject: wikibot/discourse: readd but excludes and dryMode

---
 wikibot/discourse.py | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 wikibot/discourse.py

(limited to 'wikibot/discourse.py')

diff --git a/wikibot/discourse.py b/wikibot/discourse.py
new file mode 100644
index 0000000..c8ef32a
--- /dev/null
+++ b/wikibot/discourse.py
@@ -0,0 +1,98 @@
+import collections
+import pywikibot
+import re
+
+excludedPages = [  # Titles in [[link]] form, compared against page.title(as_link=True) in handle_page.
+    '[[Discourse/archived]]'
+]
+extractionPattern = re.compile('(?:^\\* \\[?ht)[^:/]+://(\\S+)')  # Line starting "* ht..." or "* [ht...": group 1 is the non-space text after "://".
+text_version = re.compile('\\[(?:[^\\s:/]+://\\S+) (.*)\\]')  # "[scheme://url label]": group 1 is the label (greedy, up to the last "]").
+time_version = re.compile('\\* (.*) \\({{ArchiveBot job\\|\\S+ *}}\\)?, started ([0-9-]*)(?:[,;].+)?\\)')  # Group 1: text before " ({{ArchiveBot job|...}}"; group 2: digits/hyphens after "started ".
+
+def get_cleaned_url(line):
+    """Return a (date, name) sort key for one wiki line: (hyphen-stripped start date as int, lowercased text) for ArchiveBot job lines, (0, name) for other URL lines, None otherwise."""
+    TimeMatch = time_version.search(line)
+    if TimeMatch:
+        date = TimeMatch.group(2).replace('-', '')
+        # time_version's [0-9-]* can capture an empty or hyphen-only string, and int('') raises ValueError.
+        return (int(date) if date else 0, TimeMatch.group(1).lower())
+    # Otherwise the name is the label of the first [url label] link, else the text after "://" of a URL opening a bullet.
+    NameMatch = text_version.search(line)
+    if NameMatch:
+        return (0, NameMatch.group(1).lower())
+    URLMatch = extractionPattern.search(line)
+    if URLMatch:
+        return (0, URLMatch.group(1).lower())
+    return None
+
+
+def handle_page(site, page):
+    """Sort each run of consecutive URL lines on a wiki page, then pass the result to maybeUpdate.
+
+    Lines for which get_cleaned_url() returns None are left in place and act as separators
+    between runs; only the lines inside a run are reordered (by the key it returns).
+    site is forwarded to maybeUpdate untouched; page is read via title(as_link=True) and .text.
+    Returns None; pages listed in excludedPages are reported on stdout and skipped.
+    """
+    pageTitle = page.title(as_link=True)
+    if pageTitle in excludedPages:
+        print(f"Skipping {pageTitle} since it's excluded.")
+        return
+
+    output = []
+    currentBlock = []  # (line, sort key) pairs of the URL run currently being collected
+    def flushBlock():
+        # list.sort is stable, so lines with equal keys keep their original relative order.
+        currentBlock.sort(key=lambda entry: entry[1])
+        output.extend(entry[0] for entry in currentBlock)
+        currentBlock.clear()
+
+    for line in page.text.split('\n'):
+        key = get_cleaned_url(line)
+        if key is None:
+            flushBlock()
+            output.append(line)
+        else:
+            currentBlock.append((line, key))
+    flushBlock()  # The page may end inside a URL run; this replaces the old sentinel entry.
+
+    maybeUpdate(site, page, '\n'.join(output))
+
+def maybeUpdate(site, page, newContent, comment="Reordered websites"):
+    """Save newContent to page, or dump the before/after text to disk if no login is configured.
+
+    site.login() is attempted first; pywikibot's NoUsernameError selects the dry-run file dump.
+    comment is the edit summary given to page.save(). Returns None; no-op if the text is unchanged.
+    """
+    if page.text == newContent:
+        return  # Nothing changed, so there is no reason to log in or write anything.
+
+    pageTitle = page.title(as_link=True)
+    dryMode = False
+    try:
+        site.login()
+    except pywikibot.exceptions.NoUsernameError:
+        dryMode = True
+    if not dryMode:
+        print(f"Saving {pageTitle} with comment \"{comment}\"")
+        page.text = newContent
+        page.save(comment)
+        return
+
+    import os  # Kept function-local, as in the original; only the dry-run path touches the filesystem.
+    basePath = os.path.join("./wikitexts/", page.title(as_filename=True))
+    os.makedirs(os.path.dirname(basePath), exist_ok=True)  # Replaces the isdir() + makedirs() pair.
+    print(f"DRY-RUN MODE since no configured credentials. Updating {pageTitle}")
+    # Pin UTF-8 so non-ASCII page text does not depend on the locale. NOTE(review): 'a' appends on every run; confirm intended.
+    with open(f"{basePath}.txt", 'a', encoding='utf-8') as OriginalPage:
+        print(page.text, file=OriginalPage)
+    with open(f"{basePath}~.txt", 'a', encoding='utf-8') as UpdatedPage:
+        print(newContent, file=UpdatedPage)
+
+
+def main():  # Entry point: pass every page whose title has the prefix 'Discourse' to handle_page.
+    site = pywikibot.Site('en', 'ArchiveTeam')  # Presumably language code 'en' and family 'ArchiveTeam'; confirm against the pywikibot config.
+    for page in site.allpages(prefix = 'Discourse'):
+        handle_page(site, page)  # The same site object is reused for every page.
+
+if __name__ == '__main__': main()  # Run only when executed as a script, not on import.
-- 
cgit v1.3.1-10-gc9f91