# wikibot/wbmexclusions.py

import collections
import pywikibot
import re

# Matches "<scheme>://", skips an optional "www." prefix, and captures the
# following run of non-whitespace characters (host plus any path) in group 1.
extractionPattern = re.compile(r'[^:/]+://(?:www\.)?(\S+)')

# HTML comment markers that delimit the URL-count sentence on a page.
countMarkBegin = '<!-- atwikibot:urlCount -->'
countMarkEnd = '<!-- /atwikibot:urlCount -->'

def get_cleaned_url(line):
    """Return the lowercased sort key of the first URL on *line*, or None.

    The key is group 1 of extractionPattern: the text after the "://"
    separator, without a leading "www.", up to the next whitespace. That
    means it covers the host and any path that follows it, not only the
    domain. None is returned when the line contains no URL.
    """
    found = extractionPattern.search(line)
    return found.group(1).lower() if found else None


def handle_page(site, page):
    """Sort each run of URL lines on *page* and refresh its URL counter.

    The page text is split on newlines. Consecutive lines for which
    get_cleaned_url() yields a key form one run; every run is sorted by that
    key, while lines without a URL stay where they are. If the text contains
    countMarkBegin followed by countMarkEnd, everything between the two
    markers is replaced by a sentence stating the total number of URL lines.
    The page is saved (after site.login()) only when the text changed.
    """
    sorted_lines = []
    pending = []  # (key, line) pairs of the run of URL lines being collected
    total_urls = 0

    # The trailing None is a sentinel that flushes a run ending at the last line.
    for raw in page.text.split('\n') + [None]:
        key = None if raw is None else get_cleaned_url(raw)
        if key is not None:
            pending.append((key, raw))
            continue

        # A non-URL line (or the sentinel) closes the current run.
        if pending:
            total_urls += len(pending)
            # sorted() is stable, so lines with equal keys keep their order.
            ordered = sorted(pending, key=lambda pair: pair[0])
            sorted_lines.extend(text for _, text in ordered)
            pending = []
        if raw is not None:
            sorted_lines.append(raw)

    new_text = '\n'.join(sorted_lines)

    # Only rewrite the counter when an end marker follows the first begin marker.
    begin_at = new_text.find(countMarkBegin)
    if begin_at != -1:
        end_at = new_text.find(countMarkEnd, begin_at)
        if end_at != -1:
            suffix = '' if total_urls == 1 else 's'
            summary = 'This list currently contains ' + str(total_urls) + ' URL' + suffix + '.'
            new_text = ''.join((
                new_text[:begin_at],
                countMarkBegin,
                summary,
                countMarkEnd,
                new_text[end_at + len(countMarkEnd):],
            ))

    # Nothing to do when sorting and counting left the text untouched.
    if new_text == page.text:
        return

    site.login()
    page.text = new_text
    page.save("Reordered websites and/or updated count.")

def main():
    """Run handle_page() on every page whose title starts with the list prefix."""
    wiki = pywikibot.Site('en', 'ArchiveTeam')
    title_prefix = 'List of websites excluded from the Wayback Machine'
    for page in wiki.allpages(prefix=title_prefix):
        handle_page(wiki, page)

# Script entry point. This call is unconditional (there is no __name__ guard),
# so importing this module also performs a full run.
# NOTE(review): consider guarding with `if __name__ == '__main__':` once it is
# confirmed that nothing relies on import-time execution.
main()