1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# https://github.com/emijrp/internet-archive/raw/master/archivebot.py
# Copyright (C) 2018-2019 Archive Team
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import collections
import datetime
import json
import re
import sys
import time
import urllib.parse
import urllib.request
import pywikibot
import pywikibot.pagegenerators as pagegenerators
import archiveteamfun
#{{ Stubbed-out save call: Page.save is replaced so that edits are queued locally and submitted in batches. Only valid if the page is only edited by the bot.
import atexit
# Number of queued page edits that triggers an automatic commit.
BATCH_SIZE = 50
# Queued edits keyed by page title; each value is a dict with the new 'text', the edit 'summary' and the revision 'id' seen when the edit was queued.
pending_changes = {}
# Reference to the real save method, taken before Page.save is replaced further down.
_original_save = pywikibot.Page.save
# Site used to re-load the queued pages when the queue is committed.
site = pywikibot.Site()
def commit_pending_changes():
    """Submit every queued page edit to the wiki, then empty the queue.

    Each queued entry is re-loaded by title from the module-level ``site``.
    An entry is only written if the page's latest revision id still equals
    the id recorded when the edit was queued; otherwise the entry is dropped
    without saving. Does nothing when the queue is empty.
    """
    if not pending_changes:
        return
    print('[Wiki] Submitting %s changes.' % (len(pending_changes)))
    for title, data in pending_changes.items():
        page = pywikibot.Page(site, title)
        # If the page was edited in the meantime, don't update it.
        # (The bot reorders /list pages, but that can happen later; I'd rather we not lose URLs the bot doesn't know about.)
        if data['id'] != page.latest_revision_id:
            print("[Wiki] Page [[%s]] was edited in the meantime, skipping." % (title))
            continue
        page.text = data['text']
        # Call the real Page.save; Page.save itself has been replaced by the queueing stub.
        _original_save(page, data['summary'])
        print("[Wiki] Page [[%(title)s]] updated with summary: %(summary)s" % {'title': title, 'summary': data['summary']})
    # Only clear after the loop: mutating the dict while iterating over it would raise.
    pending_changes.clear()
def stub_save(self, summary = ""):
    """Queue this page's current text instead of writing it to the wiki.

    Stand-in for ``Page.save``: records the page text, the edit summary and
    the page's latest revision id under the page title in ``pending_changes``,
    and commits the whole queue once it holds ``BATCH_SIZE`` entries.
    """
    queued = {
        'text': self.text,
        'summary': summary,
        'id': self.latest_revision_id,
    }
    title = self.title()
    pending_changes[title] = queued
    print("[Wiki Stub] Saved [[%(page)s]] locally with summary: %(summary)s" % {'page': title, 'summary': summary})
    if len(pending_changes) < BATCH_SIZE:
        return
    commit_pending_changes()
# Submit whatever is still queued when the interpreter exits.
atexit.register(commit_pending_changes)
# From here on every Page.save() call goes through the local queue.
pywikibot.Page.save = stub_save
#}}
# One parsed line of a /list page. 'sorturl' is the key used for ordering, 'line' is the normalised text written back.
Entry = collections.namedtuple('Entry', ('sorturl', 'url', 'label', 'note', 'line'))
# Matches the scheme and an optional leading "www." so they can be dropped from the sort key.
truncationpattern = re.compile(r'^[^:/]+://(www\.)?')


def parselistline(line):
    """Parse one line of a URL list into an Entry.

    A line has the form ``URL [| label [| note]]``. Arguments after the URL
    may also be given as ``label = ...`` / ``note = ...`` in any position;
    an argument with any other key is treated as positional. Positional
    arguments beyond the second are ignored.

    The returned Entry holds:
      * url     - the stripped URL, with a trailing '/' appended when it has
                  a scheme but no path;
      * label   - the label, or None if absent;
      * note    - the note, or None if absent;
      * line    - the normalised line (empty label/note are omitted);
      * sorturl - the lower-cased URL without scheme and leading "www.";
                  for the listed file hosts, a URL with exactly two slashes
                  also loses its first path component.
    """
    label = None
    note = None
    if '|' in line:
        url, rest = line.split('|', 1)
        for position, arg in enumerate(map(str.strip, rest.split('|'))):
            if '=' in arg:
                key, value = map(str.strip, arg.split('=', 1))
                if key == 'label':
                    label = value
                    continue
                if key == 'note':
                    note = value
                    continue
                # If it's neither, just treat it like it didn't have any '=' to begin with...
            if position == 0:
                label = arg
            elif position == 1:
                note = arg
            # Everything else is ignored
    else:
        url = line
    url = url.strip()
    if '://' in url and '/' not in url.split('://')[1]:
        url = url + '/'
    line = url + (' | label = ' + label if label else '') + (' | note = ' + note if note else '')
    sorturl = truncationpattern.sub('', url).lower()
    for domain in ('transfer.sh', 'transfer.kiska.pw', 'transfer.archivete.am', 'transfer.notkiska.pw', 'ix.io'):
        if domain == 'ix.io' and '+' not in sorturl:
            # Only apply this stripping to the undocumented trick URLs of format ix.io/code+/filename
            continue
        # Require the '/' right after the domain so that a longer host name which merely
        # begins with the same characters (e.g. transfer.shop) is not mistaken for the file host.
        prefix = domain + '/'
        if sorturl.startswith(prefix) and sorturl.count('/') == 2:
            # For file hosting URLs that contain exactly two slashes, strip the first path component = the random file ID to sort by the filename instead.
            sorturl = prefix + sorturl.split('/', 2)[2]
    return Entry(sorturl = sorturl, url = url, label = label, note = note, line = line)
def curateurls(wlist=''):
    """Sort and deduplicate the URL list on a wiki page and return its entries per section.

    wlist is a page object: its ``text`` is read line by line, lines starting
    with '=' are section headings and every other non-blank line is parsed
    with parselistline(). Within each section the entries are deduplicated
    and sorted. If the normalised text differs from the page text, the page
    text is replaced and the page is saved with summary "BOT - Sorting list".

    Returns a dict of section name => list of Entry objects. The key is None
    for entries outside of a section (i.e. on a page without sections or
    before the first heading). A repeated section name overwrites the earlier
    one after a warning is printed.
    """
    def sortkey(entry):
        # None labels/notes sort like empty strings so the tuple stays comparable.
        return (
            entry.sorturl,
            entry.label if entry.label is not None else '',
            entry.url,
            entry.note if entry.note is not None else '',
            entry.line,
        )

    outputlines = []
    sectionentries = {}
    sectionname = None
    collected = []

    def flush():
        # Close the current section: deduplicate, sort, emit its lines and record it.
        ordered = sorted(set(collected), key = sortkey)
        outputlines.extend(entry.line for entry in ordered)
        sectionentries[sectionname] = ordered
        collected.clear()

    for rawline in wlist.text.strip().splitlines():
        stripped = rawline.strip()
        if not stripped:
            continue
        if not stripped.startswith('='):
            collected.append(parselistline(rawline))
            continue
        # Heading: finish the previous section before starting the new one.
        flush()
        sectionname = stripped.strip('=').strip()
        if sectionname in sectionentries:
            print('Warning: duplicate section name {!r} on page {}'.format(sectionname, wlist.title()))
        if outputlines:
            # Blank separator line before every heading except at the very top.
            outputlines.append('')
        outputlines.append(stripped)
    flush()

    normalised = '\n'.join(outputlines)
    if normalised != wlist.text:
        wlist.text = normalised
        wlist.save("BOT - Sorting list")
    return sectionentries
def main():
    """Regenerate the bot-maintained status tables on the ArchiveBot wiki pages.

    Walks the pages in Category:ArchiveBot whose title starts with
    'ArchiveBot/' (optionally restricted to titles containing sys.argv[1]).
    For each page that has a '/list' subpage, the list is normalised with
    curateurls() and every region between an opening '<!-- bot -->' or
    '<!-- bot:SECTION -->' tag and the closing '<!-- /bot -->' tag is replaced
    by a freshly generated wikitable for the matching list section. An
    optional '<!-- bot-total-stats -->' region receives the page totals.
    The page is saved only when its text actually changed.
    """
    atsite = pywikibot.Site('en', 'ArchiveTeam')
    cat = pywikibot.Category(atsite, "Category:ArchiveBot")
    gen = pagegenerators.CategorizedPageGenerator(cat, start="!")
    pre = pagegenerators.PreloadingGenerator(gen)
    listlenlimit = 1000

    for page in pre:
        wtitle = page.title()
        wtext = page.text
        if len(sys.argv) > 1 and sys.argv[1] not in wtitle:
            continue
        if not wtitle.startswith('ArchiveBot/'):
            continue
        wlist = pywikibot.Page(atsite, '%s/list' % (wtitle))
        if not wlist.exists():
            print("Page %s/list doesnt exist" % (wtitle))
            continue
        # The list is sorted (and possibly saved) even if the page itself is skipped below.
        sectionentries = curateurls(wlist=wlist)

        print('\n===', wtitle, '===')
        if ('<!-- bot -->' not in wtext and '<!-- bot:' not in wtext) or '<!-- /bot -->' not in wtext:
            print("No <!-- bot --> tag. Skiping...")
            continue
        if len(wlist.text.splitlines()) > listlenlimit:
            print("List has more than %d lines, skipping..." % (listlenlimit))
            continue

        newtext = []
        totaljobsize = 0
        totalsaved = 0
        totalnotsaved = 0

        # Find blocks of page text that end with a bot tag
        blocks = wtext.split('<!-- /bot -->')
        # The last block must be tag-free, so only iterate over the previous ones
        for block in blocks[:-1]:
            # Find beginning of bot tag
            pos = block.find('<!-- bot -->')
            if pos == -1:
                pos = block.find('<!-- bot:')
            if pos == -1:
                print('Block is missing opening tag, skipping...')
                newtext.append(block)
                newtext.append('<!-- /bot -->')
                continue
            if block[pos:].startswith('<!-- bot -->'):
                # Sectionless tag, use section None
                section = None
                openingtag = '<!-- bot -->'
            elif block[pos:].startswith('<!-- bot:'):
                # Extract section name
                openend = block.find('-->', pos)
                if openend == -1:
                    print("Block's opening tag does not have an end, skipping...")
                    newtext.append(block)
                    newtext.append('<!-- /bot -->')
                    continue
                section = block[pos + 9:openend].strip() # 9 = len('<!-- bot:')
                openingtag = block[pos:openend + 3]
            else:
                print('Block has an invalid bot tag, skipping...')
                newtext.append(block)
                newtext.append('<!-- /bot -->')
                continue
            if section not in sectionentries:
                print('Block references section {!r} which does not exist, skipping...'.format(section))
                newtext.append(block)
                newtext.append('<!-- /bot -->')
                continue

            # Add prefixed text (if any)
            newtext.append(block[:pos])
            # Add opening tag (as it was before)
            newtext.append(openingtag)

            # Generate table
            rows = []
            sectionjobsize = 0
            sectionhasnotes = any(entry.note is not None for entry in sectionentries[section])
            for entry in sectionentries[section]:
                # NOTE(review): getArchiveDetails is used here as returning a sequence of
                # (saved flag, wikitext detail rows, job size) - confirm against archiveteamfun.
                details = archiveteamfun.getArchiveDetails(url=entry.url)
                if details[0]:
                    viewerplain = "{{saved}}"
                    viewerdetailsplain = details[1]
                    sectionjobsize += details[2]
                else:
                    viewerplain = "{{notsaved}}"
                    viewerdetailsplain = ''
                # Every '|-' in the details starts another table row, so the leading cells must span them all.
                rowspan = len(re.findall(r'\|-', viewerdetailsplain))+1
                rowspanplain = 'rowspan=%d | ' % (rowspan) if rowspan>1 else ''
                if entry.label:
                    urllabel = '{{URLAB|1=%s|2=%s}}' % (entry.url, entry.label)
                else:
                    urllabel = '{{URLAB|1=%s}}' % (entry.url)
                if sectionhasnotes:
                    notescolumn = '%s%s || ' % (rowspanplain, entry.note if entry.note is not None else '')
                else:
                    notescolumn = ''
                rows.append("\n|-\n| %s%s || %s%s%s\n%s " % (rowspanplain, urllabel, notescolumn, rowspanplain, viewerplain, viewerdetailsplain if viewerdetailsplain else '| || || || || || '))
            rowsplain = ''.join(rows)

            totaljobsize += sectionjobsize
            sectionsaved = rowsplain.count('{{saved}}')
            totalsaved += sectionsaved
            sectionnotsaved = rowsplain.count('{{notsaved}}')
            totalnotsaved += sectionnotsaved
            notesheader = 'rowspan=2 | Notes !! ' if sectionhasnotes else ''
            output = """
* '''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)
Do not edit this table, it is automatically updated by bot. There is a [[{{FULLPAGENAME}}/list|raw list]] of URLs that you can edit.
{| class="wikitable sortable plainlinks"
! rowspan=2 | Website !! %srowspan=2 | Status !! colspan=6 | Archive details
|-
! AB Mode !! Domain !! Job !! Date !! Size !! Objects %s
|}
""" % (sectionsaved, sectionnotsaved, archiveteamfun.convertsize(b=sectionjobsize), notesheader, rowsplain)
            newtext.append(output)
            newtext.append('<!-- /bot -->')

        # Add the last, tag-free block
        newtext.append(blocks[-1])
        newtext = ''.join(newtext)

        # Replace total statistics if necessary
        if '<!-- bot-total-stats -->' in newtext:
            newtext = re.sub(r'<!-- bot-total-stats -->.*?<!-- /bot-total-stats -->', "<!-- bot-total-stats -->'''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)<!-- /bot-total-stats -->" % (totalsaved, totalnotsaved, archiveteamfun.convertsize(b = totaljobsize)), newtext)

        if wtext != newtext:
            pywikibot.showDiff(wtext, newtext)
            page.text = newtext
            try:
                page.save("BOT - Updating page: {{saved}} (%s), {{notsaved}} (%s), Total size (%s)" % (totalsaved, totalnotsaved, archiveteamfun.convertsize(b=totaljobsize)))
            except Exception as error:
                # Best effort: report the failure and carry on with the next page.
                print("Error while saving...", error)
        else:
            print("No changes needed in", page.title())

    # NOTE(review): run once after all pages were processed - confirm this placement against upstream.
    archiveteamfun.cleanArchiveBotCache()
# Only run the bot when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|