"""Sort the website lists on wiki pages whose title starts with ``Discourse``.

Recovered from git patch e633ee84b9e33f12072a887744c3ef1d090ff4f3
(author: klea, 2026-01-11, subject "wikibot/discourse: readd but excludes
and dryMode"), which creates ``wikibot/discourse.py``.

Each page is split into lines. Runs of consecutive lines that carry a
recognised entry (an ArchiveBot job line, a labelled external link or a bare
bulleted URL) are sorted in place; every other line keeps its position. If
the sorted text differs from the current page text it is saved back to the
wiki, or dumped to local files when no credentials are configured.
"""

import collections
import os
import re

import pywikibot

# Page titles (in ``[[link]]`` form) that must never be touched.
excludedPages = [
    '[[Discourse/archived]]'
]

# ``* http://host/path`` or ``* [http://host/path``: captures everything
# after ``://`` up to the first whitespace.
extractionPattern = re.compile(r'(?:^\* \[?ht)[^:/]+://(\S+)')

# ``[scheme://url label]``: captures the label of a labelled external link.
text_version = re.compile(r'\[(?:[^\s:/]+://\S+) (.*)\]')

# ``* name ({{ArchiveBot job|id}}, started YYYY-MM-DD...)``: captures the
# name (group 1) and the start date (group 2).
time_version = re.compile(
    r'\* (.*) \({{ArchiveBot job\|\S+ *}}\)?, started ([0-9-]*)(?:[,;].+)?\)'
)


def get_cleaned_url(line):
    """Return the sort key for *line*, or ``None`` if it is not an entry.

    The key is a ``(date, name)`` tuple:

    * ArchiveBot job lines yield the start date with its dashes removed, as
      an integer, plus the lower-cased entry name.
    * Otherwise the lower-cased label of the first ``[url label]`` link is
      used, with a date of 0.
    * Otherwise the lower-cased remainder of a bulleted URL (everything
      after ``://``) is used, with a date of 0.
    """
    time_match = time_version.search(line)
    if time_match:
        digits = time_match.group(2).replace('-', '')
        # The date group may legally be empty (or consist only of dashes);
        # int('') would raise ValueError and abort the whole run.
        date = int(digits) if digits else 0
        return (date, time_match.group(1).lower())

    name_match = text_version.search(line)
    if name_match:
        return (0, name_match.group(1).lower())

    url_match = extractionPattern.search(line)
    if url_match:
        return (0, url_match.group(1).lower())

    return None


def _flush_block(block, output):
    """Append the lines of *block* to *output* in key order and empty it."""
    # list.sort is stable, so entries with equal keys keep their page order.
    block.sort(key=lambda entry: entry[0])
    output.extend(line for _, line in block)
    block.clear()


def _sort_url_blocks(text):
    """Return *text* with every run of consecutive entry lines sorted."""
    output = []
    block = []  # (sort key, line) pairs of the run currently being collected
    for line in text.split('\n'):
        key = get_cleaned_url(line)
        if key is None:
            # A non-entry line terminates the current run.
            _flush_block(block, output)
            output.append(line)
        else:
            block.append((key, line))
    # The page may end in the middle of a run.
    _flush_block(block, output)
    return '\n'.join(output)


def handle_page(site, page):
    """Sort the entry blocks of *page* and hand the result to maybeUpdate.

    Pages listed in ``excludedPages`` are skipped with a message.
    """
    title = page.title(as_link=True)
    if title in excludedPages:
        print(f"Skipping {title} since it's excluded.")
        return

    maybeUpdate(site, page, _sort_url_blocks(page.text))


def maybeUpdate(site, page, newContent, comment="Reordered websites"):
    """Save *newContent* to *page* if it differs from the current text.

    When ``site.login()`` raises ``NoUsernameError`` (no credentials are
    configured) nothing is saved; instead the current and the new text are
    written to ``./wikitexts/<page>.txt`` and ``./wikitexts/<page>~.txt``.
    """
    if page.text == newContent:
        return  # Nothing to do when the content is unchanged.

    try:
        site.login()
    except pywikibot.exceptions.NoUsernameError:
        dry_mode = True
    else:
        dry_mode = False

    page_title = page.title(as_link=True)

    if dry_mode:
        output_dir = "./wikitexts/"
        os.makedirs(output_dir, exist_ok=True)
        base_path = os.path.join(output_dir, page.title(as_filename=True))

        print(f"DRY-RUN MODE since no configured credentials. Updating {page_title}")
        # Overwrite rather than append: with append mode, repeated dry runs
        # stack copies in the same file and the before/after pair can no
        # longer be compared. UTF-8 is explicit so the dump does not depend
        # on the locale's default encoding.
        with open(f"{base_path}.txt", 'w', encoding='utf-8') as original_file:
            print(page.text, file=original_file)

        with open(f"{base_path}~.txt", 'w', encoding='utf-8') as updated_file:
            print(newContent, file=updated_file)
    else:
        print(f"Saving {page_title} with comment \"{comment}\"")
        page.text = newContent
        page.save(comment)


def main():
    """Process every page of the wiki whose title starts with ``Discourse``."""
    site = pywikibot.Site('en', 'ArchiveTeam')
    for page in site.allpages(prefix='Discourse'):
        handle_page(site, page)


if __name__ == '__main__':
    main()