summaryrefslogtreecommitdiff
path: root/wikibot-manual/archiveteamfun.py
blob: 45b61f97d94b131ef86a2ead258500facbf3580e (about) (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py

# Copyright (C) 2018-2019 Archive Team
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import datetime
import gzip
import json
import os
import pickle
import random
import re
import sys
import _thread
import time
import unicodedata
import urllib
import urllib.request
import urllib.parse

# Absolute path of the on-disk pickle cache, stored next to this module.
# Built with os.path so that it stays correct when __file__ is a bare relative
# filename (joining the "/"-split pieces then produced "/archivebot.cache.pickle",
# i.e. a file in the filesystem root) and on platforms whose path separator
# is not "/".
ArchiveBotCacheFile = os.path.join(os.path.dirname(os.path.abspath(__file__)), "archivebot.cache.pickle")
# In-memory cache: maps a fetched URL to the decoded text that was downloaded for it.
ArchivebotCache = {}

# Fix:
# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?)
#
# Error no json:
"""Retry in 20 seconds...
Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye
Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json
"""

def convertsize(b=0): #bytes
    """Format a byte count as a short size string for wiki tables.

    The number and the unit are separated by the HTML entity ``&nbsp;``.
    Sizes below 1 KiB are shown as ``0&nbsp;KiB``; KiB and MiB are shown as
    whole numbers (rounded down); GiB and larger are shown with one decimal.

    Values of 1 EiB or more are reported in EiB (they previously fell through
    every branch and produced ``None``).

    Anything whose type is not exactly ``int`` is returned unchanged, so
    callers can pass through placeholder values.
    """
    if type(b) is not int:
        return b

    kib = 1024
    if b < kib: #<1KiB
        return '0&nbsp;KiB'
    if b < kib**2: #<1MiB
        return '%d&nbsp;KiB' % (b // kib)
    if b < kib**3: #<1GiB
        return '%d&nbsp;MiB' % (b // kib**2)
    if b < kib**4: #<1TiB
        return '%.1f&nbsp;GiB' % (b / float(kib**3))
    if b < kib**5: #<1PiB
        return '%.1f&nbsp;TiB' % (b / float(kib**4))
    if b < kib**6: #<1EiB
        return '%.1f&nbsp;PiB' % (b / float(kib**5))
    return '%.1f&nbsp;EiB' % (b / float(kib**6))

def loadArchivebotCache():
    """Read the pickled cache from ArchiveBotCacheFile.

    Returns a shallow copy of the unpickled object, or an empty dict when
    the cache file does not exist.
    """
    if not os.path.exists(ArchiveBotCacheFile):
        return {}
    with open(ArchiveBotCacheFile, 'rb') as cachefile:
        cached = pickle.load(cachefile)
    return cached.copy()

def removeFromArchivebotCache(url='', save=True):
    """Drop `url` from the in-memory cache, if it is there.

    Nothing happens for an empty url or a url that is not cached.
    When an entry was removed and `save` is true, the cache is written
    to disk right away via saveArchivebotCache().
    """
    global ArchivebotCache
    if not url or url not in ArchivebotCache:
        return
    ArchivebotCache.pop(url)
    if save:
        saveArchivebotCache()

def saveArchivebotCache():
    """Pickle the in-memory ArchivebotCache into ArchiveBotCacheFile, overwriting it."""
    with open(ArchiveBotCacheFile, 'wb') as cachefile:
        pickle.dump(ArchivebotCache, cachefile)

def cleanArchiveBotCache():
    """Evict cached pages that have to be downloaded again, then save the cache.

    Evicted entries:
      * viewer search pages that reported no search results,
      * viewer domain pages that link to 10 or more jobs,
      * viewer job pages that mention a timestamp of the current year but
        list no .json file or no .warc.gz of the job,
      * 'borg.xyz/logs/' URLs that do not contain '.log'.
    """
    global ArchivebotCache

    searchprefix = "https://archive.fart.website/archivebot/viewer/?q="
    domainprefix = "https://archive.fart.website/archivebot/viewer/domain/"
    jobprefix = "https://archive.fart.website/archivebot/viewer/job/"

    def evict(cachedurl):
        # in-memory removal only; the cache is written once at the end
        removeFromArchivebotCache(url=cachedurl, save=False)

    # walk a snapshot, because entries are deleted from the live dict meanwhile
    for url, raw in list(ArchivebotCache.items()):
        # searches without results must be repeated in the next run
        if url.startswith(searchprefix) and re.search(r'(?im)<em>No search results.</em>', raw):
            evict(url)

        # domains with many jobs (FB, TW, etc): their result pages change frequently
        if url.startswith(domainprefix):
            joblinks = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw)
            if len(joblinks) >= 10:
                evict(url)

        # jobs with problems or still in progress: check again in the next run
        # whether the problem was solved
        if url.startswith(jobprefix):
            job = url.split(jobprefix)[1]
            thisyearstamp = r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year)
            jsonfileurls = re.findall(r'(?im)<a href="(https://archive\.org/download/[^"<> ]+\.json)">', raw)
            if not jsonfileurls and re.search(thisyearstamp, raw): #job in progress
                evict(url)
            warcs = re.findall(r"(?im)>\s*[^<>\"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*</a>\s*</td>\s*<td>(\d+)</td>" % (job), raw)
            if not warcs and re.search(thisyearstamp, raw): #job in progress
                evict(url)

        if 'borg.xyz/logs/' in url and '.log' not in url:
            evict(url)

    saveArchivebotCache()

def _fetchURL(request, url):
    """Open `request` and return the response body as stripped UTF-8 text.

    Bodies of URLs ending in '.gz' are gunzipped before decoding. The
    response is always closed. Network, decompression and decoding errors
    propagate to the caller.
    """
    with urllib.request.urlopen(request) as response:
        if url.endswith('.gz'):
            body = gzip.GzipFile(fileobj=response).read()
        else:
            body = response.read()
    return body.strip().decode('utf-8')


def getURL(url='', cache=False, retry=True):
    """Download `url` and return its content as text ('' on failure).

    url   -- address to fetch. An empty url, or one containing '8grwh'
             (deleted jobs/jsons), returns '' without any request.
    cache -- when true, return the entry from ArchivebotCache if present
             (loading the cache from disk first if it is empty) and store
             freshly downloaded pages in it. After a successful download
             the cache is saved to disk at random, roughly once every
             hundred stores.
    retry -- when true, a failed download is retried after 10 and after
             20 seconds; retrying stops at the first success.

    Failed downloads of URLs ending in '.json' are never retried, because
    some .json files are deleted on IA.
    """
    global ArchivebotCache

    if not url: #nothing to fetch; building a Request from '' raises ValueError
        return ''
    if '8grwh' in url: #deleted jobs/jsons
        return ''

    if cache: #do not download if it is cached
        if not ArchivebotCache: #empty dict
            ArchivebotCache = loadArchivebotCache()
        if url in ArchivebotCache:
            return ArchivebotCache[url]

    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0, ArchiveWikiListGenerator) Gecko/20100101 Firefox/55.0' }
    request = urllib.request.Request(url, headers=headers)

    raw = ''
    fetched = False
    print("Retrieving: %s" % (url))
    try:
        raw = _fetchURL(request, url)
        fetched = True
    except Exception: #best effort, but let KeyboardInterrupt/SystemExit through
        if url.endswith('.json'): #some .json are deleted on IA
            return ''

        sleep = 10 # seconds
        maxsleep = 30
        while retry and sleep <= maxsleep:
            print('Error while retrieving: %s' % (url))
            print('Retry in %s seconds...' % (sleep))
            time.sleep(sleep)
            try:
                raw = _fetchURL(request, url)
                fetched = True
                break #success: do not sleep and download again
            except Exception:
                sleep = sleep * 2

    if fetched and cache: #refresh cache
        ArchivebotCache[url] = raw
        # saving is kept outside the try blocks so that a failing save is
        # not mistaken for a failed download
        if not random.randint(0, 100):
            saveArchivebotCache()
    return raw

def genJobDetails(mode='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False):
    """Build one wiki table row (as a string) describing a job.

    The row holds, in order: mode, domainlink, joburl, jobdate, a size cell
    and an objects cell. The objects cell shows `jobobjects` and sorts by
    its first space-separated token.

    Size cell:
      * jobsize is not exactly an int: sort value 0, text is whatever
        convertsize() returns for it, no colour template;
      * jobsize below 1024: {{red|...}};
      * otherwise {{purple|...}} if jobproblem, else {{orange|...}} if
        jobaborted, else {{green|...}}.
    """
    leadingcells = '| style="white-space: nowrap;" | %s || %s || %s || %s' % (mode, domainlink, joburl, jobdate)

    if type(jobsize) is not int:
        sizecell = 'data-sort-value=0 | %s' % (convertsize(b=jobsize),)
    else:
        if jobsize < 1024:
            jobcolor = 'red'
        elif jobproblem:
            jobcolor = 'purple'
        elif jobaborted:
            jobcolor = 'orange'
        else:
            jobcolor = 'green'
        sizecell = 'data-sort-value=%d | {{%s|%s}}' % (jobsize, jobcolor, convertsize(b=jobsize))

    objectscell = 'data-sort-value=%s | %s' % (jobobjects.split(' ')[0], jobobjects)
    return ' || '.join((leadingcells, sizecell, objectscell))

def getArchiveDetailsArchivebot(url='', singleurl=False):
    """Collect ArchiveBot job rows for `url` from the archive.fart.website viewer.

    The viewer is searched for `url`; for every matching domain the job list
    is fetched, and for each of the first `jobslimit` jobs the job page is
    scanned for .json and .warc.gz links. One table row (see genJobDetails)
    is produced per job .json whose URL contains the timestamp of a WARC run.

    url       -- must contain '://' (the domain is taken from the part after it).
    singleurl -- when true, each job .json is downloaded and the job is kept
                 only if its 'url' field equals `url`, ignoring leading and
                 trailing slashes.

    Returns (details, totaljobsize): the list of row strings and the sum of
    the sizes of the reported runs (presumably bytes, as the value is handed
    to convertsize -- TODO confirm against the viewer's size column).
    """
    viewerurl = 'https://archive.fart.website/archivebot/viewer/?q=' + url
    origdomain = url.split('://')[1].split('/')[0]
    # same domain with a leading "www"/"www<digits>" label replaced by nothing
    # but the dot, e.g. "www.example.com" -> ".example.com"
    origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain)
    rawdomains = getURL(url=viewerurl, cache=False)
    domains = list(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains)))
    if not domains: #no results for this url, remove cache
        # NOTE(review): the search page above is fetched with cache=False, so this
        # can only drop an entry that got into the cache some other way -- confirm
        removeFromArchivebotCache(url=viewerurl)
    details = []
    totaljobsize = 0
    jobslimit = 10 # before 10000
    for domain in domains:
        # keep a domain only if it equals / is contained in the original domain,
        # or contains the www-less form of the original domain
        if domain != origdomain and not domain in origdomain and not origdomain2 in domain:
            continue
        urljobs = "https://archive.fart.website/archivebot/viewer/domain/" + domain
        rawjobs = getURL(url=urljobs, cache=False) #false, we want the most recent list of jobs always

        #remove unrelated jobs, for example googlesites, facebook, etc
        #(only the <tr> chunks that mention the url, without scheme and outer slashes, are kept)
        rawjobs2 = ""
        for rawjobcandidate in rawjobs.split("<tr>"):
            if url.split('://')[1].strip('/') in rawjobcandidate:
                rawjobs2 += rawjobcandidate
        rawjobs = rawjobs2

        jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs)
        print("jobs", jobs)
        for jobid in jobs[:jobslimit]:
            # last 5 characters of the job id: used in the WARC file name patterns
            # below and in the {{ArchiveBot job|...}} template
            jobidtruncated = jobid[-5:]
            urljob = "https://archive.fart.website/archivebot/viewer/job/" + jobid
            print(urljob)
            rawjob = getURL(url=urljob, cache=True)
            jsonfileurls = re.findall(r'(?im)<a href="(https://archive\.org/download/[^<>\" ]+\.json)">', rawjob)
            for jsonfileurl in jsonfileurls:
                print(jsonfileurl)
                if singleurl:
                    jsonraw = getURL(url=jsonfileurl, cache=True) #cache json from internet archive
                    try:
                        jsonfileloaded = json.loads(jsonraw)
                    except:
                        # unparsable or missing json (getURL returns '' for those): skip it
                        continue
                    # skip jobs that were started for a different url
                    if not 'url' in jsonfileloaded or ('url' in jsonfileloaded and jsonfileloaded['url'].strip('/') != url.strip('/')):
                        continue

                jobproblem = False
                # each match is a tuple (inf|shallow, date YYYYMMDD, time HHMMSS, value of the following <td>)
                warcs = re.findall(r"(?im)<a href=\"[^<>\" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\">[^<>\"]*?</a>\s*</td>\s*<td>(\d+)</td>" % (jobidtruncated), rawjob)
                print(warcs)
                if not warcs:
                    # no WARC rows on the job page: flagged as a problem
                    # (with no WARCs the loop below has nothing to report)
                    jobproblem = True
                # distinct "date-time" stamps of the WARCs, in ascending order
                jobdatetimes = []
                for warc in warcs:
                    jobdatetimes.append("%s-%s" % (warc[1], warc[2]))
                jobdatetimes = list(set(jobdatetimes))
                jobdatetimes.sort()
                for jobdatetime in jobdatetimes:
                    # report only the run whose timestamp appears in this json's url
                    if not jobdatetime in jsonfileurl:
                        continue
                    # number of WARC names in which the job id is followed by '-' and
                    # that end in digits before ".warc.gz" (presumably excludes the
                    # "-meta" WARC, going by the variable name -- TODO confirm)
                    warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobidtruncated), rawjob))
                    inforshallow = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobidtruncated), rawjob)))

                    print(jobdatetime, warcsnometa, inforshallow)

                    # exactly one distinct mode -> that mode, otherwise 'unknown'
                    inforshallow = len(inforshallow) == 1 and inforshallow[0] or 'unknown'
                    # only 'inf' maps to recursive; 'unknown' is labelled shallow as well
                    toolb = inforshallow == 'inf' and 'Recursive (!a)' or 'Shallow (!ao)'
                    jobaborted = False
                    # note: this check uses the full job id, not the truncated one
                    if ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob:
                        jobaborted = True
                    jobdate = '-' in jobdatetime and jobdatetime.split('-')[0] or 'nodate'
                    # sum of the size column over the WARCs that carry this timestamp
                    jobsize = sum([jobdatetime == '%s-%s' % (warc[1], warc[2]) and int(warc[3]) or 0 for warc in warcs])
                    if jobdate and jobdate != 'nodate':
                        # YYYYMMDD -> YYYY-MM-DD
                        jobdate = '%s-%s-%s' % (jobdate[0:4], jobdate[4:6], jobdate[6:8])
                    #jobdetails = genJobDetails(tool=toolb, domainlink="[https://archive.fart.website/archivebot/viewer/domain/%s %s]" % (domain, domain), joburl="[https://archive.fart.website/archivebot/viewer/job/%s %s]" % (jobidtruncated, jobidtruncated), jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
                    jobdetails = genJobDetails(mode=toolb, domainlink=domain, joburl="{{ArchiveBot job|"+jobidtruncated+"}}", jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
                    totaljobsize += jobsize
                    details.append(jobdetails)
    return details, totaljobsize

def getArchiveDetails(url=''):
    """Look up ArchiveBot jobs for `url` and return them as wiki table rows.

    Returns a tuple (found, rows, totaljobsize):
      found        -- True when at least one job row was produced,
      rows         -- the rows joined with a table row separator (newline,
                      '|-', newline), '' when there are none,
      totaljobsize -- total size reported by getArchiveDetailsArchivebot.

    (False, '', 0) is returned without any lookup when `url` is empty, has
    no '://', has an empty host part, or points to archive.org or
    www.webcitation.org.

    A url with more than one character after its host (domain.ext/more) is
    looked up as a single url; otherwise (domain.ext or domain.ext/) the
    whole domain is looked up.
    """
    if not url or '://' not in url:
        return False, '', 0
    if '://archive.org/' in url or \
       '://www.webcitation.org/' in url:
        return False, '', 0

    domain = url.split('://')[1].split('/')[0]
    if not domain: #e.g. "http:///path": there is no host to look up
        return False, '', 0

    # Everything after the host. It is sliced off by length rather than by
    # splitting the url on the domain, because splitting cuts the path short
    # when the host name shows up in it again (https://example.com/example.com).
    afterdomain = url.split('://', 1)[1][len(domain):]
    singleurl = len(afterdomain) > 1 #True: domain.ext/more, False: domain.ext

    details, totaljobsize = getArchiveDetailsArchivebot(url=url, singleurl=singleurl)
    return bool(details), '\n|-\n'.join(details), totaljobsize