"""Sort the URL lists on ArchiveTeam wiki pages whose title starts with 'Discourse'.

Each page's text is split into lines; consecutive lines that contain a
recognised URL entry form a block, and every block is sorted in place.  The
result is written to a mirror page under ``User:KleaBot/mirror/`` or, when no
wiki credentials are configured, dumped to local text files instead.
"""

import collections
import os
import pywikibot
import re

# Page titles (in "[[Title]]" link form) that must never be processed.
excludedPages = [
    #'[[Discourse/archived]]',
]

# "* http://host/path" or "* [http://host/path ...": captures everything after "://".
extractionPattern = re.compile(r'(?:^\* \[?ht)[^:/]+://(\S+)')
# "[scheme://url label]": captures the label text.
text_version = re.compile(r'\[(?:[^\s:/]+://\S+) (.*)\]')
# "* <entry> ({{ArchiveBot job|<id>}}, started <date>...)": captures entry and date.
time_version = re.compile(r'\* (.*) \({{ArchiveBot job\|\S+ *}}\)?, started ([0-9-]*)(?:[,;].+)?\)')


def get_cleaned_url(line):
    """Return the sort key for a wiki line, or None if it holds no URL entry.

    The key is a ``(date, name)`` tuple:

    * lines matching ``time_version`` yield the digits of the "started" date
      as an int, plus the lower-cased entry text;
    * otherwise, lines with a bracketed external link yield ``0`` plus the
      lower-cased link label;
    * otherwise, lines starting with ``* ht...://`` (optionally ``* [ht``)
      yield ``0`` plus the lower-cased URL without its scheme.
    """
    time_match = time_version.search(line)
    if time_match:
        digits = time_match.group(2).replace('-', '')
        # The date group may be empty (or only dashes); int('') would raise
        # ValueError, so such entries sort like undated ones.
        return (int(digits) if digits else 0, time_match.group(1).lower())

    name_match = text_version.search(line)
    if name_match:
        return (0, name_match.group(1).lower())

    url_match = extractionPattern.search(line)
    if url_match:
        return (0, url_match.group(1).lower())

    return None


def _sort_url_blocks(text):
    """Return *text* with every run of consecutive URL lines sorted by key.

    Lines for which ``get_cleaned_url`` returns None are kept in place and
    act as block separators.  Sorting is stable, so lines with equal keys
    keep their original relative order.
    """
    output = []
    block = []  # (line, key) pairs of the run currently being collected

    def flush():
        block.sort(key=lambda entry: entry[1])
        output.extend(entry[0] for entry in block)
        block.clear()

    for line in text.split('\n'):
        key = get_cleaned_url(line)
        if key is None:
            flush()
            output.append(line)
        else:
            block.append((line, key))
    flush()  # the page may end inside a block

    return '\n'.join(output)


def handle_page(site, page):
    """Sort the URL blocks of *page* and hand the result to ``maybeUpdate``.

    Pages whose link-form title is listed in ``excludedPages`` are skipped.
    """
    title_link = page.title(as_link=True)
    if title_link in excludedPages:
        print(f"Skipping {title_link} since it's excluded.")
        return

    maybeUpdate(site, page, _sort_url_blocks(page.text))


def maybeUpdate(site, page, newContent, comment="Reordered websites"):
    """Store *newContent* on the mirror page of *page* if it differs.

    The mirror page is ``User:KleaBot/mirror/<title>`` with every ':' in the
    namespaced title replaced by '/'.  Nothing happens when the mirror
    already holds *newContent*.  If ``site.login()`` raises
    ``NoUsernameError`` the function runs in dry-run mode: the mirror's
    current text and *newContent* are written to files under
    ``./wikitexts/`` instead of being saved to the wiki.
    """
    mirror_title = 'User:KleaBot/mirror/' + page.title(with_ns=True).replace(':', '/')
    pageTitle = page.title(as_link=True)
    page_filename = page.title(as_filename=True)
    mirror = pywikibot.Page(site, mirror_title)
    output_dir = "./wikitexts/"

    if mirror.text == newContent:
        return  # Already up to date; nothing to log in for or to write.

    try:
        site.login()
    except pywikibot.exceptions.NoUsernameError:
        dry_mode = True
    else:
        dry_mode = False

    if not dry_mode:
        print(f"Saving {pageTitle} with comment \"{comment}\"")
        mirror.text = newContent
        mirror.save(comment)
        return

    os.makedirs(output_dir, exist_ok=True)
    print(f"DRY-RUN MODE since no configured credentials. Updating {pageTitle}")
    # Explicit UTF-8: wikitext is not guaranteed to fit the platform's
    # default encoding.
    # NOTE(review): append mode means repeated dry runs accumulate in the
    # same files - confirm this is intended.
    original_path = os.path.join(output_dir, f"{page_filename}.txt")
    with open(original_path, 'a', encoding='utf-8') as original_file:
        print(mirror.text, file=original_file)
    updated_path = os.path.join(output_dir, f"{page_filename}~.txt")
    with open(updated_path, 'a', encoding='utf-8') as updated_file:
        print(newContent, file=updated_file)


def main():
    """Process every page of the English ArchiveTeam wiki prefixed 'Discourse'."""
    site = pywikibot.Site('en', 'ArchiveTeam')
    for page in site.allpages(prefix='Discourse'):
        handle_page(site, page)


if __name__ == '__main__':
    main()