summaryrefslogtreecommitdiff
path: root/wikibot/discourse.py
diff options
context:
space:
mode:
authorGravatar klea2026-01-11 18:59:19 +0000
committerGravatar klea2026-01-11 18:59:19 +0000
commite633ee84b9e33f12072a887744c3ef1d090ff4f3 (patch)
tree97be193a1b883c3d54adfb5c3af22b38caaf5ef6 /wikibot/discourse.py
parentzygolophodon: init unpatched version (diff)
signature
wikibot/discourse: readd but excludes and dryMode
Diffstat (limited to 'wikibot/discourse.py')
-rw-r--r--wikibot/discourse.py98
1 files changed, 98 insertions, 0 deletions
diff --git a/wikibot/discourse.py b/wikibot/discourse.py
new file mode 100644
index 0000000..c8ef32a
--- /dev/null
+++ b/wikibot/discourse.py
@@ -0,0 +1,98 @@
+import collections
+import pywikibot
+import re
+
# Pages that handle_page must leave untouched, written the way
# ``page.title(as_link=True)`` renders them.
excludedPages = [
    '[[Discourse/archived]]',
]

# Bullet line that starts with an (optionally bracketed) ``ht...://`` URL;
# group 1 is everything after the scheme up to the next whitespace.
extractionPattern = re.compile(r'(?:^\* \[?ht)[^:/]+://(\S+)')
# External link of the form ``[scheme://url label]``; group 1 is the label.
text_version = re.compile(r'\[(?:[^\s:/]+://\S+) (.*)\]')
# Bullet annotated with ``({{ArchiveBot job|...}}, started <date>...)``;
# group 1 is the text before the annotation, group 2 the digits-and-dashes date.
time_version = re.compile(
    r'\* (.*) \({{ArchiveBot job\|\S+ *}}\)?, started ([0-9-]*)(?:[,;].+)?\)'
)


def get_cleaned_url(line):
    """Return the sort key for one line of wikitext, or None if it has no URL.

    The key is a ``(date, text)`` tuple, tried in this order:

    1. A line carrying an ArchiveBot job annotation yields the start date as
       an integer (dashes removed) and the lowercased text that precedes the
       annotation.
    2. Otherwise the label of the first ``[url label]`` link, lowercased,
       with date 0.
    3. Otherwise the scheme-less URL of a bullet that starts with a URL,
       lowercased, with date 0.

    Args:
        line: A single line of page text.

    Returns:
        ``(int, str)`` when one of the patterns matches, else ``None``.
    """
    time_match = time_version.search(line)
    if time_match:
        digits = time_match.group(2).replace('-', '')
        # The date group may legally be empty (or only dashes); int('') would
        # raise ValueError, so such lines sort with date 0 instead.
        date = int(digits) if digits else 0
        return (date, time_match.group(1).lower())

    name_match = text_version.search(line)
    if name_match:
        return (0, name_match.group(1).lower())

    url_match = extractionPattern.search(line)
    if url_match:
        return (0, url_match.group(1).lower())

    return None
+
+
def handle_page(site, page):
    """Sort every run of consecutive URL lines on *page* and submit the result.

    Lines for which ``get_cleaned_url`` returns ``None`` stay where they are
    and act as separators; each run of URL lines between two separators is
    sorted by its key. The sort is stable, so lines with equal keys keep
    their original relative order. Pages listed in ``excludedPages`` are
    skipped with a message.

    Args:
        site: Site object, passed through to ``maybeUpdate``.
        page: Page object providing ``title(as_link=True)`` and ``text``.
    """
    title = page.title(as_link=True)
    if title in excludedPages:
        print(f"Skipping {title} since it's excluded.")
        return

    output = []
    block = []  # (line, sort key) pairs of the run currently being collected

    def flush_block():
        # Emit the collected run in sorted order and start a new one.
        block.sort(key=lambda entry: entry[1])
        output.extend(entry[0] for entry in block)
        block.clear()

    for line in page.text.split('\n'):
        key = get_cleaned_url(line)
        if key is None:
            # A line without a URL ends the current run and is kept as-is.
            flush_block()
            output.append(line)
        else:
            block.append((line, key))
    # The page may end in the middle of a run.
    flush_block()

    maybeUpdate(site, page, '\n'.join(output))
+
def maybeUpdate(site, page, newContent, comment="Reordered websites"):
    """Save *newContent* to *page*, or dump it to disk when not logged in.

    Nothing happens when the page text already equals *newContent*.
    Otherwise ``site.login()`` is attempted. If it raises
    ``pywikibot.exceptions.NoUsernameError`` the function runs in dry-run
    mode and appends the current text to ``./wikitexts/<name>.txt`` and the
    new text to ``./wikitexts/<name>~.txt``. If login succeeds, the page text
    is replaced and saved with *comment*.

    Args:
        site: Site object providing ``login()``.
        page: Page object providing ``title()``, ``text`` and ``save()``.
        newContent: The full replacement text for the page.
        comment: Edit summary passed to ``page.save``.
    """
    if page.text == newContent:
        return  # Nothing changed, so there is nothing to save or dump.

    pageTitle = page.title(as_link=True)

    try:
        site.login()
    except pywikibot.exceptions.NoUsernameError:
        dryMode = True
    else:
        dryMode = False

    if not dryMode:
        print(f"Saving {pageTitle} with comment \"{comment}\"")
        page.text = newContent
        page.save(comment)
        return

    import os

    outputDir = "./wikitexts/"
    pageFilename = page.title(as_filename=True)
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(outputDir, exist_ok=True)

    print(f"DRY-RUN MODE since no configured credentials. Updating {pageTitle}")
    # Wiki text routinely contains non-ASCII characters, so the encoding is
    # fixed rather than left to the platform default.
    # NOTE(review): append mode is kept from the original; repeated dry runs
    # accumulate copies in these files -- confirm whether 'w' was intended.
    with open(os.path.join(outputDir, f"{pageFilename}.txt"), 'a',
              encoding='utf-8') as original_file:
        print(page.text, file=original_file)

    with open(os.path.join(outputDir, f"{pageFilename}~.txt"), 'a',
              encoding='utf-8') as updated_file:
        print(newContent, file=updated_file)
+
+
def main():
    """Run handle_page on every page of the wiki whose title starts with 'Discourse'.

    The wiki is the one pywikibot returns for ``Site('en', 'ArchiveTeam')``.
    """
    wiki = pywikibot.Site('en', 'ArchiveTeam')
    discourse_pages = wiki.allpages(prefix='Discourse')
    for discourse_page in discourse_pages:
        handle_page(wiki, discourse_page)


if __name__ == '__main__':
    main()