wikibot-manual/archivebot.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# https://github.com/emijrp/internet-archive/raw/master/archivebot.py

# Copyright (C) 2018-2019 Archive Team
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import collections
import datetime
import json
import re
import sys
import time
import urllib.parse
import urllib.request
import pywikibot
import pywikibot.pagegenerators as pagegenerators

import archiveteamfun


#{{ Stubbed-out save call: edits are queued locally and submitted in batches.
# This is only valid if the pages are edited by nobody but the bot.
import atexit
BATCH_SIZE = 50  # Number of queued pages at which stub_save triggers a commit
pending_changes = {}  # Page title => {'text': ..., 'summary': ..., 'id': ...} of the queued edit
_original_save = pywikibot.Page.save  # The real save method, kept for the actual submission
site = pywikibot.Site()  # Site the queued pages are re-fetched from on commit; NOTE(review): presumably the default site from the user config, which must match the site used in main() -- confirm

def commit_pending_changes():
    """Submit all queued page edits to the wiki and empty the queue.

    Each entry of pending_changes (page title => dict with 'text', 'summary'
    and 'id') is written with the original, unpatched Page.save. An edit is
    dropped without being written if the page's latest revision ID differs
    from the one recorded when the edit was queued.

    Does nothing if the queue is empty. The queue is only cleared after every
    entry has been processed; an exception raised by a save propagates and
    leaves the queue untouched.
    """
    if not pending_changes:
        return
    print('[Wiki] Submitting %s changes.' % (len(pending_changes)))
    for title, data in pending_changes.items():
        page = pywikibot.Page(site, title)
        # If the page was edited in the mean time, don't update it.
        # (The bot reorders /list pages, but that can happen later; we'd rather not lose URLs the bot doesn't know about.)
        if data['id'] != page.latest_revision_id:
            print("[Wiki] Page [[%(title)s]] was edited in the mean time, not updating it." % {'title': title})
            continue
        page.text = data['text']
        _original_save(page, data['summary'])
        print("[Wiki] Page [[%(title)s]] updated with summary: %(summary)s" % {'title': title, 'summary': data['summary']})
    pending_changes.clear()

def stub_save(self, summary = ""):
    """Queue this page's current text instead of writing it to the wiki.

    Replacement for pywikibot.Page.save: the text, the edit summary and the
    page's latest revision ID are stored in pending_changes under the page
    title, overwriting any edit already queued for that title. Once the queue
    holds BATCH_SIZE pages, it is submitted via commit_pending_changes.
    """
    title = self.title()
    queued = {
        'text': self.text,
        'summary': summary,
        # Recorded so that the commit can detect edits made in the mean time
        'id': self.latest_revision_id,
    }
    pending_changes[title] = queued
    print("[Wiki Stub] Saved [[%(page)s]] locally with summary: %(summary)s" % {'page': title, 'summary': summary})
    if len(pending_changes) < BATCH_SIZE:
        return
    commit_pending_changes()

# Submit whatever is still queued when the interpreter exits.
atexit.register(commit_pending_changes)
# From here on, every Page.save call only queues the edit (see stub_save).
pywikibot.Page.save = stub_save
#}}

# One parsed line of a /list page:
#   sorturl: key the entry is sorted by (lower-cased URL without scheme and leading 'www.')
#   url:     the URL as written, with a trailing slash added to bare hosts
#   label:   optional link label, None if absent
#   note:    optional note, None if absent
#   line:    the normalised list line the entry is written back as
Entry = collections.namedtuple('Entry', ('sorturl', 'url', 'label', 'note', 'line'))
# Scheme and optional leading 'www.', which must not influence the sort order
truncationpattern = re.compile(r'^[^:/]+://(www\.)?')

def parselistline(line):
    """Parse one non-empty, non-heading line of a /list page into an Entry.

    The line has the form 'URL', 'URL | label' or 'URL | label | note'. The
    label and the note may also be given as 'label = ...' and 'note = ...',
    in which case their position does not matter. Further arguments are
    ignored. An empty label or note is treated like an absent one (None), so
    that entries which are written back as the same line compare equal.

    A URL consisting of only a scheme and a host gets a trailing slash.
    """
    label = None
    note = None
    url, separator, rest = line.partition('|')
    if separator:
        for position, arg in enumerate(rest.split('|')):
            arg = arg.strip()
            if '=' in arg:
                key, value = (part.strip() for part in arg.split('=', 1))
                if key == 'label':
                    label = value
                    continue
                if key == 'note':
                    note = value
                    continue
                # If it's neither, just treat it like it didn't have any '=' to begin with...
            if position == 0:
                label = arg
            elif position == 1:
                note = arg
            # Everything else is ignored
    url = url.strip()

    # '' and None both mean "not present"; keep only one representation.
    label = label or None
    note = note or None

    # Only look at what follows the first '://': a second one (e.g. inside a query string) must not be mistaken for the scheme separator.
    if '://' in url and '/' not in url.split('://', 1)[1]:
        url = url + '/'
    line = url + (' | label = ' + label if label else '') + (' | note = ' + note if note else '')

    # Lower-case before stripping the prefix so that 'WWW.' is removed as well.
    sorturl = truncationpattern.sub('', url.lower())
    for domain in ('transfer.sh', 'transfer.kiska.pw', 'transfer.archivete.am', 'transfer.notkiska.pw', 'ix.io'):
        if domain == 'ix.io' and '+' not in sorturl:
            # Only apply this stripping to the undocumented trick URLs of format ix.io/code+/filename
            continue
        # Compare including the slash so that hosts merely starting with the domain (transfer.shoes.com) are left alone.
        if sorturl.startswith(domain + '/') and sorturl.count('/') == 2:
            # For file hosting URLs that contain exactly two slashes, strip the first path component = the random file ID to sort by the filename instead.
            sorturl = domain + sorturl[sorturl.index('/', len(domain) + 1):]
    return Entry(sorturl = sorturl, url = url, label = label, note = note, line = line)

def curateurls(wlist=''):
    """Sort and deduplicate a /list page and return its entries per section.

    wlist is the wiki page object of the list; its text is read, and if the
    normalised text differs, it is assigned back and saved with the summary
    "BOT - Sorting list".

    Returns a dict of section name => list of Entry objects (sorted, without
    duplicates). The section name is None for URLs outside of a section, i.e.
    on a page without sections or before the first section. An Entry's label
    is None if it isn't present.
    """
    outputlines = []
    entriesbysection = {}
    sectionname = None
    collected = []

    def entrykey(entry):
        # Total order over all fields so that the result does not depend on set iteration order
        return (
            entry.sorturl,
            '' if entry.label is None else entry.label,
            entry.url,
            '' if entry.note is None else entry.note,
            entry.line,
        )

    def closesection():
        # Deduplicate and sort what was collected for the current section, then start over
        unique = list(set(collected))
        unique.sort(key = entrykey)
        for entry in unique:
            outputlines.append(entry.line)
        entriesbysection[sectionname] = unique
        collected.clear()

    for rawline in wlist.text.strip().splitlines():
        stripped = rawline.strip()
        if not stripped:
            # Blank lines are dropped; one is re-inserted in front of every heading below
            continue
        if not stripped.startswith('='):
            collected.append(parselistline(rawline))
            continue

        # Heading: finish the previous section before starting the new one
        closesection()
        sectionname = stripped.strip('=').strip()
        if sectionname in entriesbysection:
            print('Warning: duplicate section name {!r} on page {}'.format(sectionname, wlist.title()))
        if outputlines:
            outputlines.append('')
        outputlines.append(stripped)
    closesection()

    normalised = '\n'.join(outputlines)
    if normalised != wlist.text:
        wlist.text = normalised
        wlist.save("BOT - Sorting list")

    return entriesbysection

def main():
    """Regenerate the status tables on the ArchiveBot pages of the ArchiveTeam wiki.

    Every page in Category:ArchiveBot whose title starts with 'ArchiveBot/' is
    processed; if a command line argument is given, only pages whose title
    contains it. For each page, the accompanying '/list' page is sorted and
    deduplicated (curateurls). Then every '<!-- bot --> ... <!-- /bot -->' or
    '<!-- bot:SECTION --> ... <!-- /bot -->' block is replaced by a table
    generated from the URLs of the respective list section. An optional
    '<!-- bot-total-stats --> ... <!-- /bot-total-stats -->' span on the same
    line receives the totals over all sections. Blocks that cannot be
    processed are kept as they are.
    """
    atsite = pywikibot.Site('en', 'ArchiveTeam')
    cat = pywikibot.Category(atsite, "Category:ArchiveBot")
    gen = pagegenerators.CategorizedPageGenerator(cat, start="!")
    pre = pagegenerators.PreloadingGenerator(gen)
    listlenlimit = 1000
    closingtag = '<!-- /bot -->'
    for page in pre:
        wtitle = page.title()
        wtext = page.text

        if len(sys.argv) > 1 and sys.argv[1] not in wtitle:
            continue

        if not wtitle.startswith('ArchiveBot/'):
            continue
        wlist = pywikibot.Page(atsite, '%s/list' % (wtitle))
        if not wlist.exists():
            print("Page %s/list doesnt exist" % (wtitle))
            continue
        # The list is curated even if the page itself turns out to be skipped below.
        sectionentries = curateurls(wlist=wlist)

        print('\n===', wtitle, '===')
        if ('<!-- bot -->' not in wtext and '<!-- bot:' not in wtext) or closingtag not in wtext:
            print("No <!-- bot --> tag. Skiping...")
            continue
        if len(wlist.text.splitlines()) > listlenlimit:
            print('List has more than %d lines, skipping...' % (listlenlimit))
            continue

        parts = []
        totaljobsize = 0
        totalsaved = 0
        totalnotsaved = 0

        # Find blocks of page text that end with a bot tag
        blocks = wtext.split(closingtag)

        # The last block must be tag-free, so only iterate over the previous ones
        for block in blocks[:-1]:
            # Find beginning of bot tag
            pos = block.find('<!-- bot -->')
            if pos == -1:
                pos = block.find('<!-- bot:')

            # Sectionless tag unless determined otherwise: use section None
            section = None
            openingtag = '<!-- bot -->'
            problem = None
            if pos == -1:
                problem = 'Block is missing opening tag, skipping...'
            elif block.startswith('<!-- bot:', pos):
                # Extract section name
                openend = block.find('-->', pos)
                if openend == -1:
                    problem = "Block's opening tag does not have an end, skipping..."
                else:
                    section = block[pos + len('<!-- bot:'):openend].strip()
                    openingtag = block[pos:openend + 3]
            if problem is None and section not in sectionentries:
                problem = 'Block references section {!r} which does not exist, skipping...'.format(section)
            if problem is not None:
                # Keep the block exactly as it was
                print(problem)
                parts.append(block)
                parts.append(closingtag)
                continue

            # Add prefixed text (if any)
            parts.append(block[:pos])

            # Add opening tag (as it was before)
            parts.append(openingtag)

            # Generate table
            entries = sectionentries[section]
            rows = []
            sectionjobsize = 0
            sectionhasnotes = any(entry.note is not None for entry in entries)
            for entry in entries:
                # details[0]: whether the URL was saved, details[1]: wikitext of the detail rows, details[2]: job size
                details = archiveteamfun.getArchiveDetails(url=entry.url)
                if details[0]:
                    viewerplain = "{{saved}}"
                    viewerdetailsplain = details[1]
                    sectionjobsize += details[2]
                else:
                    viewerplain = "{{notsaved}}"
                    viewerdetailsplain = ''
                # Every additional row separator in the details is one more table row the other cells have to span
                rowspan = len(re.findall(r'\|-', viewerdetailsplain))+1
                rowspanplain = 'rowspan=%d | ' % (rowspan) if rowspan>1 else ''
                if entry.label:
                    urllabel = '{{URLAB|1=%s|2=%s}}' % (entry.url, entry.label)
                else:
                    urllabel = '{{URLAB|1=%s}}' % (entry.url)
                if sectionhasnotes:
                    notescolumn = '%s%s || ' % (rowspanplain, entry.note if entry.note is not None else '')
                else:
                    notescolumn = ''
                rows.append("\n|-\n| %s%s || %s%s%s\n%s " % (rowspanplain, urllabel, notescolumn, rowspanplain, viewerplain, viewerdetailsplain if viewerdetailsplain else '|  ||  ||  ||  ||  || '))
            rowsplain = ''.join(rows)

            totaljobsize += sectionjobsize
            sectionsaved = rowsplain.count('{{saved}}')
            totalsaved += sectionsaved
            sectionnotsaved = rowsplain.count('{{notsaved}}')
            totalnotsaved += sectionnotsaved
            notesheader = 'rowspan=2 | Notes !! ' if sectionhasnotes else ''
            output = """
* '''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)

Do not edit this table, it is automatically updated by bot. There is a [[{{FULLPAGENAME}}/list|raw list]] of URLs that you can edit.

{| class="wikitable sortable plainlinks"
! rowspan=2 | Website !! %srowspan=2 | Status !! colspan=6 | Archive details
|-
! AB Mode !! Domain !! Job !! Date !! Size !! Objects %s
|}
""" % (sectionsaved, sectionnotsaved, archiveteamfun.convertsize(b=sectionjobsize), notesheader, rowsplain)
            parts.append(output)

            parts.append(closingtag)

        # Add the last, tag-free block
        parts.append(blocks[-1])

        newtext = ''.join(parts)

        # Replace total statistics if necessary
        if '<!-- bot-total-stats -->' in newtext:
            totalstats = "<!-- bot-total-stats -->'''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)<!-- /bot-total-stats -->" % (totalsaved, totalnotsaved, archiveteamfun.convertsize(b = totaljobsize))
            # A function as replacement inserts the text verbatim, without backslash escape processing
            newtext = re.sub(r'<!-- bot-total-stats -->.*?<!-- /bot-total-stats -->', lambda match: totalstats, newtext)

        if wtext != newtext:
            pywikibot.showDiff(wtext, newtext)
            page.text = newtext
            try:
                page.save("BOT - Updating page: {{saved}} (%s), {{notsaved}} (%s), Total size (%s)" % (totalsaved, totalnotsaved, archiveteamfun.convertsize(b=totaljobsize)))
            except Exception as error:
                # Best effort: carry on with the next page, but don't swallow KeyboardInterrupt/SystemExit
                print("Error while saving...", repr(error))
        else:
            print("No changes needed in", page.title())

    archiveteamfun.cleanArchiveBotCache()

# Only run the bot when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()