wikibot/discourse.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

import collections
import pywikibot
import re

# Pages the bot must leave untouched. Entries are compared against
# page.title(as_link=True) in handle_page(), so they are written in wiki-link
# form, as in the commented-out example below.
excludedPages = [
    #'[[Discourse/archived]]',
]
# Fallback: a line starting with "* " (optionally followed by "[") and a URL
# whose scheme begins with "ht"; captures everything after "://" up to the
# next whitespace character.
extractionPattern = re.compile('(?:^\\* \\[?ht)[^:/]+://(\\S+)')
# Bracketed external link "[scheme://address label]"; captures the label
# (greedily, up to the last "]" on the line).
text_version = re.compile('\\[(?:[^\\s:/]+://\\S+) (.*)\\]')
# Bullet line carrying an "({{ArchiveBot job|<id>}}, started <date>...)" note;
# group 1 is the text between "* " and the note, group 2 the digits-and-dashes
# date, which the pattern allows to be empty.
time_version = re.compile('\\* (.*) \\({{ArchiveBot job\\|\\S+ *}}\\)?, started ([0-9-]*)(?:[,;].+)?\\)')

def get_cleaned_url(line):
    """Return the sort key for one line of wiki text.

    The key is a ``(date, text)`` tuple so that keys from all three branches
    compare with each other:

    * a line with an ArchiveBot "started <date>" note yields the date with its
      dashes removed, as an int, plus the lower-cased text preceding the note;
    * otherwise the lower-cased label of the first bracketed link is used,
      with a date of 0;
    * otherwise the lower-cased remainder of a leading bullet URL is used,
      with a date of 0.

    Returns None when none of the patterns match, i.e. no URL was found.
    """
    time_match = time_version.search(line)
    if time_match:
        digits = time_match.group(2).replace('-', '')
        # The date group may be empty (or dashes only); int('') would raise
        # ValueError, so such an entry gets the same 0 date as undated ones.
        started = int(digits) if digits else 0
        return (started, time_match.group(1).lower())

    name_match = text_version.search(line)
    if name_match:
        return (0, name_match.group(1).lower())

    url_match = extractionPattern.search(line)
    if url_match:
        return (0, url_match.group(1).lower())

    return None


def handle_page(site, page):
    """Sort every run of consecutive URL lines on a page and submit the result.

    Lines for which get_cleaned_url() returns a key are gathered into runs;
    a line without a key ends the current run and is emitted unchanged. Each
    run is sorted by its key before being emitted, and the reassembled text
    is handed to maybeUpdate(). Pages listed in excludedPages are skipped.
    """
    link = page.title(as_link=True)
    if link in excludedPages:
        print(f"Skipping {link} since it's excluded.")
        return

    result_lines = []
    pending = []  # (line, key) pairs of the run currently being collected

    def flush_pending():
        # Emit the collected run in key order and start a fresh one.
        pending.sort(key=lambda pair: pair[1])
        result_lines.extend(text for text, _ in pending)
        pending.clear()

    for text in page.text.split('\n'):
        key = get_cleaned_url(text)
        if key is not None:
            pending.append((text, key))
            continue
        # A line without a URL terminates the run before it.
        flush_pending()
        result_lines.append(text)

    # The page may end inside a run.
    flush_pending()

    maybeUpdate(site, page, '\n'.join(result_lines))

def maybeUpdate(site, page, newContent, comment="Reordered websites"):
    """Store newContent on the bot's mirror of a page if it differs.

    The text is not written to `page` itself but to
    "User:KleaBot/mirror/<title>", where <title> is the namespaced title of
    `page` with ":" replaced by "/". Nothing happens when the mirror already
    holds newContent.

    If site.login() raises NoUsernameError the function runs in dry-run mode:
    instead of saving, it appends the mirror's current text to
    "./wikitexts/<filename>.txt" and newContent to
    "./wikitexts/<filename>~.txt".

    Parameters:
        site: pywikibot site used to log in and to look up the mirror page.
        page: the source page whose title determines the mirror page.
        newContent: the text to store.
        comment: edit summary passed to save().
    """
    pageMirror = 'User:KleaBot/mirror/' + page.title(with_ns=True).replace(':','/')
    pageTitle = page.title(as_link=True)
    pageFilename = page.title(as_filename=True)
    # From here on `page` refers to the mirror page, not the source page.
    page = pywikibot.Page(site, pageMirror)
    outputDir = "./wikitexts/"

    if page.text == newContent:
        return # Doesn't make sense to try to do logic if it's the same.

    try:
        site.login()
        dryMode = False
    except pywikibot.exceptions.NoUsernameError:
        dryMode = True

    if not dryMode:
        print(f"Saving {pageTitle} with comment \"{comment}\"")
        page.text = newContent
        page.save(comment)
        return

    import os
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(outputDir, exist_ok=True)

    print(f"DRY-RUN MODE since no configured credentials. Updating {pageTitle}")
    # Wiki text is Unicode; an explicit encoding keeps the dump from failing
    # or varying with the platform's locale default.
    # NOTE(review): both files are opened in append mode, so repeated dry runs
    # accumulate copies in the same file - confirm this is intended.
    with open(f"{outputDir}/{pageFilename}.txt", 'a', encoding='utf-8') as OriginalPage:
        print(page.text, file=OriginalPage)

    with open(f"{outputDir}/{pageFilename}~.txt", 'a', encoding='utf-8') as UpdatedPage:
        print(newContent, file=UpdatedPage)


def main():
    """Run handle_page() on every page of the wiki whose title starts with "Discourse"."""
    wiki = pywikibot.Site('en', 'ArchiveTeam')
    discourse_pages = wiki.allpages(prefix='Discourse')
    for wiki_page in discourse_pages:
        handle_page(wiki, wiki_page)


if __name__ == '__main__':
    main()