1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py
# Copyright (C) 2018-2019 Archive Team
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
import gzip
import json
import os
import pickle
import random
import re
import sys
import _thread
import time
import unicodedata
import urllib
import urllib.request
import urllib.parse
# Pickle file holding the page cache, stored next to this script.
# Built from the absolute path of __file__: splitting on "/" collapsed to
# "/archivebot.cache.pickle" (filesystem root) whenever __file__ had no
# directory component, e.g. when run as "python archiveteamfun.py".
ArchiveBotCacheFile = os.path.join(os.path.dirname(os.path.abspath(__file__)), "archivebot.cache.pickle")
# In-memory cache: URL -> decoded page text. Loaded lazily by getURL().
ArchivebotCache = {}
# Fix:
# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?)
#
# Error no json:
"""Retry in 20 seconds...
Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye
Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json
"""
def convertsize(b=0): #bytes
    """Format a byte count as a short human-readable size string.

    Parameters:
        b: size in bytes. Only plain ints are formatted.

    Returns:
        '0 KiB' below 1 KiB, whole KiB/MiB below 1 GiB, and GiB/TiB/PiB/EiB
        with one decimal above that. Any non-int value (for example '' for
        an unknown size) is returned unchanged.
    """
    if type(b) is not int: #anything that is not a plain int is passed through
        return b
    if b < 1024: #<1KiB
        return '0 KiB'
    if b < 1024**2: #<1MiB
        return '%d KiB' % (b // 1024)
    if b < 1024**3: #<1GiB
        return '%d MiB' % (b // 1024**2)
    for exponent, unit in ((3, 'GiB'), (4, 'TiB'), (5, 'PiB')):
        if b < 1024**(exponent + 1):
            return '%.1f %s' % (b / 1024.0**exponent, unit)
    #1EiB or more used to fall through every branch and return None
    return '%.1f EiB' % (b / 1024.0**6)
def loadArchivebotCache():
    """Load the pickled page cache from ArchiveBotCacheFile.

    Returns:
        A shallow copy of the unpickled dict, or an empty dict when the file
        does not exist or cannot be unpickled (for example a file truncated
        by an interrupted save). An unreadable file is reported on stdout
        and ignored, so the cache is simply rebuilt.

    NOTE: pickle executes code from the file it loads; this must only ever
    read the cache file written by saveArchivebotCache().
    """
    try:
        with open(ArchiveBotCacheFile, 'rb') as f:
            c = pickle.load(f)
    except FileNotFoundError: #no cache yet
        return {}
    except (EOFError, pickle.UnpicklingError) as e: #truncated or corrupt file
        print('Ignoring unreadable cache file %s: %s' % (ArchiveBotCacheFile, e))
        return {}
    return c.copy()
def removeFromArchivebotCache(url='', save=True):
    """Drop one URL from the in-memory cache and optionally persist the change.

    Parameters:
        url: cache key (page URL) to forget. Empty or unknown URLs are ignored.
        save: when True, write the cache to disk after a successful removal.

    The cache is only saved when an entry was actually removed: saving after
    a no-op would dump a not-yet-loaded (empty) in-memory cache over the
    cache file.
    """
    global ArchivebotCache
    if not url or url not in ArchivebotCache:
        return
    del ArchivebotCache[url]
    if save:
        saveArchivebotCache()
def saveArchivebotCache():
    """Write the in-memory ArchivebotCache dict to ArchiveBotCacheFile.

    The pickle is first written to a temporary file beside the cache and then
    moved into place with os.replace, so an interrupted run cannot leave a
    truncated cache file behind.
    """
    global ArchivebotCache
    tmpfile = ArchiveBotCacheFile + '.tmp'
    with open(tmpfile, 'wb') as f:
        pickle.dump(ArchivebotCache, f)
    os.replace(tmpfile, ArchiveBotCacheFile)
def cleanArchiveBotCache():
    """Remove cached pages that must be fetched again in the next run, then save.

    Entries dropped from the cache:
      * viewer searches that returned no results,
      * viewer domain pages listing 10 or more jobs (they change frequently),
      * viewer job pages that lack a json link or warc rows and carry a
        timestamp from the current year (treated as still in progress),
      * borg.xyz log URLs that do not contain '.log'.

    The cache is loaded from disk first when it is empty in memory, so that
    cleaning never overwrites an existing cache file with an empty dict.
    """
    global ArchivebotCache
    if not ArchivebotCache: #empty dict: load it first, same as getURL() does
        ArchivebotCache = loadArchivebotCache()
    viewer = "https://archive.fart.website/archivebot/viewer/"
    #heuristic: a -<current year>MMDD-HHMMSS- stamp in the page marks a job
    #that may still be in progress
    thisyearstamp = r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year)
    stale = []
    for url, raw in ArchivebotCache.items():
        if url.startswith(viewer + "?q="):
            #remove from cache urls without results
            #we need to check for results in the next run
            if re.search(r'(?im)<em>No search results.</em>', raw):
                stale.append(url)
        elif url.startswith(viewer + "domain/"):
            #remove from cache domains with many jobs (FB, TW, etc)
            #these result pages change frequently
            jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw)
            if len(jobs) >= 10:
                stale.append(url)
        elif url.startswith(viewer + "job/"):
            #remove from cache jobs with problems or in progress
            #we need to check whether problems were solved in the next run
            job = url.split(viewer + "job/")[1]
            jsonfileurls = re.findall(r'(?im)<a href="(https://archive\.org/download/[^"<> ]+\.json)">', raw)
            #job id is escaped so it is always matched literally
            warcs = re.findall(r"(?im)>\s*[^<>\"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*</a>\s*</td>\s*<td>(\d+)</td>" % (re.escape(job)), raw)
            if (not jsonfileurls or not warcs) and re.search(thisyearstamp, raw):
                stale.append(url)
        if 'borg.xyz/logs/' in url and not '.log' in url:
            stale.append(url)
    #delete after the scan so the dict is not modified while iterating
    for url in stale:
        removeFromArchivebotCache(url=url, save=False)
    saveArchivebotCache()
def _fetchURL(request, url):
    """Download *request* and return its body as stripped UTF-8 text.

    Bodies of URLs ending in '.gz' are gunzipped before decoding. The
    response is closed when done.
    """
    with urllib.request.urlopen(request) as response:
        if url.endswith('.gz'):
            with gzip.GzipFile(fileobj=response) as gzipFile:
                data = gzipFile.read()
        else:
            data = response.read()
    return data.strip().decode('utf-8')


def getURL(url='', cache=False, retry=True):
    """Return the text content of *url*, optionally through the page cache.

    Parameters:
        url: address to download. An empty URL, or one containing '8grwh'
            (deleted jobs/jsons), yields '' without any request.
        cache: when True, return the cached copy if there is one and store
            freshly downloaded text in ArchivebotCache (the cache is loaded
            from disk on first use and saved to disk at random intervals).
        retry: when True, a failed download is retried after 10 and then
            20 seconds.

    Returns:
        The decoded, stripped page text, or '' when the download failed.
        A failing '.json' URL returns '' immediately because some json files
        are deleted on the Internet Archive.
    """
    global ArchivebotCache
    if not url or '8grwh' in url: #nothing to fetch / deleted jobs/jsons
        return ''
    if cache: #do not download if it is cached
        if not ArchivebotCache: #empty dict
            ArchivebotCache = loadArchivebotCache()
        if url in ArchivebotCache:
            return ArchivebotCache[url]
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0, ArchiveWikiListGenerator) Gecko/20100101 Firefox/55.0' }
    request = urllib.request.Request(url, headers=headers)
    print("Retrieving: %s" % (url))
    raw = ''
    try:
        raw = _fetchURL(request, url)
    except Exception: #network, gzip or decoding problem; Ctrl-C is not swallowed
        if url.endswith('.json'): #some .json are deleted on IA
            return ''
        sleep = 10 # seconds
        maxsleep = 30
        while retry and sleep <= maxsleep:
            print('Error while retrieving: %s' % (url))
            print('Retry in %s seconds...' % (sleep))
            time.sleep(sleep)
            try:
                raw = _fetchURL(request, url)
            except Exception:
                sleep = sleep * 2
            else:
                if cache: #refresh cache
                    ArchivebotCache[url] = raw
                break #success: stop retrying instead of downloading again
        return raw
    if cache: #refresh cache
        ArchivebotCache[url] = raw
        if not random.randint(0, 100): #persist about once every 101 downloads
            try:
                saveArchivebotCache()
            except OSError as e: #a failed save must not lose the downloaded page
                print('Could not save cache: %s' % (e))
    return raw
def genJobDetails(mode='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False):
    """Build the cells of one wiki table row describing an ArchiveBot job.

    Parameters:
        mode, domainlink, joburl, jobdate: text for the first four cells.
        jobsize: job size; when it is an int it is used as the sort value
            and shown through convertsize() in a colour template, otherwise
            the sort value is 0 and the value is shown without colour.
        jobobjects: text such as '12 warcs'; the part before the first
            space is used as the sort value of the last cell.
        jobaborted, jobproblem: choose the colour for sizes of 1024 or more
            (green by default, orange if aborted, purple if there is a
            problem; purple wins over orange). Smaller sizes are always red.

    Returns:
        The wikitext row as a string.
    """
    sortobjects = jobobjects.split(' ')[0]
    prettysize = convertsize(b=jobsize)
    if type(jobsize) is not int: #unknown size: no colour, sort value 0
        return '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=0 | %s || data-sort-value=%s | %s' % (mode, domainlink, joburl, jobdate, prettysize, sortobjects, jobobjects)
    if jobsize < 1024:
        return '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{red|%s}} || data-sort-value=%s | %s' % (mode, domainlink, joburl, jobdate, jobsize, prettysize, sortobjects, jobobjects)
    if jobproblem:
        jobcolor = 'purple'
    elif jobaborted:
        jobcolor = 'orange'
    else:
        jobcolor = 'green'
    return '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{%s|%s}} || data-sort-value=%s | %s' % (mode, domainlink, joburl, jobdate, jobsize, jobcolor, prettysize, sortobjects, jobobjects)
def _jobJsonMatchesURL(jsonraw, url):
    """Tell whether a job json (raw text) has a 'url' field equal to *url*, ignoring slashes at both ends."""
    try:
        loaded = json.loads(jsonraw)
    except ValueError: #empty (deleted on IA) or malformed json
        return False
    if not isinstance(loaded, dict):
        return False
    joburl = loaded.get('url')
    return isinstance(joburl, str) and joburl.strip('/') == url.strip('/')


def getArchiveDetailsArchivebot(url='', singleurl=False):
    """Collect wiki table rows for the ArchiveBot jobs that archived *url*.

    The ArchiveBot viewer is searched for *url*; for every matching domain
    the job list is fetched, rows not mentioning the URL are discarded, and
    at most 10 jobs per domain are inspected.

    Parameters:
        url: full URL including the scheme ('https://...').
        singleurl: when True, a job is only counted if its json file on the
            Internet Archive names exactly this URL.

    Returns:
        (details, totaljobsize): a list of row strings built by
        genJobDetails() and the sum of the sizes of the reported jobs.
        ([], 0) when *url* has no '://' part.
    """
    if '://' not in url: #no scheme://domain to look up
        return [], 0
    viewer = 'https://archive.fart.website/archivebot/viewer/'
    viewerurl = viewer + '?q=' + url
    urlnoscheme = url.split('://')[1]
    origdomain = urlnoscheme.split('/')[0]
    origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain)
    urlneedle = urlnoscheme.strip('/')
    rawdomains = getURL(url=viewerurl, cache=False)
    #sorted so the rows come out in the same order on every run
    domains = sorted(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains)))
    if not domains: #no results for this url, remove cache
        removeFromArchivebotCache(url=viewerurl)
    details = []
    totaljobsize = 0
    jobslimit = 10 # before 10000
    for domain in domains:
        if domain != origdomain and domain not in origdomain and origdomain2 not in domain:
            continue
        urljobs = viewer + "domain/" + domain
        rawjobs = getURL(url=urljobs, cache=False) #false, we want the most recent list of jobs always
        #remove unrelated jobs, for example googlesites, facebook, etc
        rawjobs = "".join(row for row in rawjobs.split("<tr>") if urlneedle in row)
        jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs)
        print("jobs", jobs)
        for jobid in jobs[:jobslimit]:
            jobidtruncated = jobid[-5:]
            jobpattern = re.escape(jobidtruncated) #match the id literally
            urljob = viewer + "job/" + jobid
            print(urljob)
            rawjob = getURL(url=urljob, cache=True)
            #these depend only on the job page, so compute them once per job
            warcs = re.findall(r"(?im)<a href=\"[^<>\" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\">[^<>\"]*?</a>\s*</td>\s*<td>(\d+)</td>" % (jobpattern), rawjob)
            warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobpattern), rawjob))
            modes = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobpattern), rawjob)))
            #NOTE(review): with no warcs there are no datetimes below, so no
            #row is produced and this flag never reaches genJobDetails as True
            jobproblem = not warcs
            jobdatetimes = sorted(set("%s-%s" % (warc[1], warc[2]) for warc in warcs))
            jsonfileurls = re.findall(r'(?im)<a href="(https://archive\.org/download/[^<>\" ]+\.json)">', rawjob)
            for jsonfileurl in jsonfileurls:
                print(jsonfileurl)
                if singleurl:
                    jsonraw = getURL(url=jsonfileurl, cache=True) #cache json from internet archive
                    if not _jobJsonMatchesURL(jsonraw, url):
                        continue
                print(warcs)
                for jobdatetime in jobdatetimes:
                    if jobdatetime not in jsonfileurl:
                        continue
                    print(jobdatetime, warcsnometa, modes)
                    inforshallow = modes[0] if len(modes) == 1 else 'unknown'
                    toolb = 'Recursive (!a)' if inforshallow == 'inf' else 'Shallow (!ao)'
                    jobaborted = ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob
                    #jobdatetime is always YYYYMMDD-HHMMSS here (built from the warc regex)
                    day = jobdatetime.split('-')[0]
                    jobdate = '%s-%s-%s' % (day[0:4], day[4:6], day[6:8])
                    jobsize = sum(int(warc[3]) for warc in warcs if jobdatetime == '%s-%s' % (warc[1], warc[2]))
                    jobdetails = genJobDetails(mode=toolb, domainlink=domain, joburl="{{ArchiveBot job|"+jobidtruncated+"}}", jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
                    totaljobsize += jobsize
                    details.append(jobdetails)
    return details, totaljobsize
def getArchiveDetails(url=''):
    """Look up the ArchiveBot jobs for *url* and return them as wikitext.

    Parameters:
        url: full URL including the scheme. URLs on archive.org and
            www.webcitation.org are skipped.

    Returns:
        (found, rows, totaljobsize): whether any job was found, the job rows
        joined with the wiki row separator, and the summed job size.
        (False, '', 0) when the URL is empty, has no '://', has no domain,
        or belongs to a skipped site.
    """
    if not url or '://' not in url:
        return False, '', 0
    if '://archive.org/' in url or '://www.webcitation.org/' in url:
        return False, '', 0
    domain = url.split('://')[1].split('/')[0]
    if not domain: #e.g. 'http://': nothing to search for
        return False, '', 0
    #what follows the domain; taken by position because splitting the url on
    #the domain text breaks when that text occurs again (scheme or path)
    path = url.split('://', 1)[1][len(domain):]
    #more than a bare trailing slash means a single page (domain.ext/more)
    #rather than a whole site (domain.ext)
    singleurl = len(path) > 1
    details, totaljobsize = getArchiveDetailsArchivebot(url=url, singleurl=singleurl)
    return bool(details), '\n|-\n'.join(details), totaljobsize
|