#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py
# Copyright (C) 2018-2019 Archive Team
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import datetime
import gzip
import json
import os
import pickle
import random
import re
import sys
import _thread
import time
import unicodedata
import urllib
import urllib.request
import urllib.parse
# Path of the pickle file that persists the URL cache between runs. It is
# placed in the same directory as this script. The path is made absolute
# first: splitting a bare relative __file__ (e.g. "archiveteamfun.py") on "/"
# leaves no directory part and would point the cache at the filesystem root.
ArchiveBotCacheFile = os.path.join(os.path.dirname(os.path.abspath(__file__)), "archivebot.cache.pickle")
# In-memory cache: maps a URL to the decoded text that getURL() fetched for it.
ArchivebotCache = {}
# Fix:
# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?)
#
# Error no json:
"""Retry in 20 seconds...
Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye
Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json
"""
def convertsize(b=0): #bytes
    """Return a human-readable size string for a byte count.

    b -- size in bytes. Anything that is not exactly an ``int`` is returned
         unchanged (genJobDetails forwards its placeholder sizes, such as
         the default '', this way).

    Sizes below 1 KiB are reported as '0 KiB'; KiB and MiB are truncated to
    whole units; GiB and larger are shown with one decimal. Sizes of 1 EiB
    and above are reported in EiB instead of falling through the unit chain.
    """
    if type(b) is not int:
        return b
    if b < 1024: #<1KiB
        return '0 KiB'
    if b < 1024**2: #<1MiB
        return '%d KiB' % (b // 1024)
    if b < 1024**3: #<1GiB
        return '%d MiB' % (b // 1024**2)
    # From GiB upwards the value is shown with one decimal.
    for exponent, unit in ((3, 'GiB'), (4, 'TiB'), (5, 'PiB')):
        if b < 1024**(exponent + 1):
            return '%.1f %s' % (b / 1024.0**exponent, unit)
    return '%.1f EiB' % (b / 1024.0**6)
def loadArchivebotCache():
    """Load the pickled URL cache from ArchiveBotCacheFile.

    Returns a copy of the stored dict, or an empty dict when the file does
    not exist, cannot be unpickled (e.g. truncated by an interrupted save),
    or does not contain a dict.
    """
    # NOTE: pickle.load can execute arbitrary code; the cache file must only
    # ever be written by this script and must not come from untrusted sources.
    try:
        with open(ArchiveBotCacheFile, 'rb') as f:
            cached = pickle.load(f)
    except FileNotFoundError:
        return {}
    except (EOFError, pickle.UnpicklingError, AttributeError, ImportError, IndexError) as e:
        # A damaged cache only costs re-downloading; do not abort the run.
        print('Ignoring unreadable cache file %s: %s' % (ArchiveBotCacheFile, e))
        return {}
    if not isinstance(cached, dict):
        return {}
    return cached.copy()
def removeFromArchivebotCache(url='', save=True):
    """Remove url from the in-memory cache and optionally persist the change.

    url  -- cache key to drop; empty or unknown keys are ignored.
    save -- when True, write the cache to disk after a successful removal.

    The cache is written only when an entry was actually removed. Saving
    unconditionally would dump the in-memory dict even when it has not been
    loaded yet (still empty), overwriting the cache file on disk.
    """
    if not url or url not in ArchivebotCache:
        return
    del ArchivebotCache[url]
    if save:
        saveArchivebotCache()
def saveArchivebotCache():
    """Write the in-memory cache (ArchivebotCache) to ArchiveBotCacheFile.

    The pickle is written to a temporary file in the same directory and then
    moved over the real file with os.replace, so an interrupted run cannot
    leave a half-written cache behind.
    """
    tmpfile = '%s.%d.tmp' % (ArchiveBotCacheFile, os.getpid())
    try:
        with open(tmpfile, 'wb') as f:
            pickle.dump(ArchivebotCache, f)
        os.replace(tmpfile, ArchiveBotCacheFile)
    finally:
        # Only present if the dump or the rename failed.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def cleanArchiveBotCache():
    """Drop cache entries that need to be fetched again, then save the cache.

    Entries removed:
      * viewer search pages ("?q=") whose text says "No search results",
        so the search is repeated on the next run;
      * viewer domain pages that list 10 or more jobs (FB, TW, etc), because
        these result pages change frequently;
      * viewer job pages that show no .json link or no .warc.gz row while
        containing a timestamp of the current year (treated as a job still
        in progress or with problems);
      * 'borg.xyz/logs/' URLs that do not contain '.log'.
    """
    global ArchivebotCache
    if not ArchivebotCache: #empty dict
        # Clean what is stored on disk; saving a never-loaded, empty dict at
        # the end of this function would wipe the cache file instead.
        ArchivebotCache = loadArchivebotCache()
    searchprefix = "https://archive.fart.website/archivebot/viewer/?q="
    domainprefix = "https://archive.fart.website/archivebot/viewer/domain/"
    jobprefix = "https://archive.fart.website/archivebot/viewer/job/"
    # Matches "-YYYYMMDD-HHMMSS-" for the current year.
    thisyear = re.compile(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year))
    # Iterate over a snapshot: entries are deleted while looping.
    for url, raw in list(ArchivebotCache.items()):
        stale = False
        #remove from cache urls without results
        #we need to check for results in the next run
        if url.startswith(searchprefix):
            if re.search(r'(?im)No search results.', raw):
                stale = True
        #remove from cache domains with many jobs (FB, TW, etc)
        #these result pages change frequently
        if url.startswith(domainprefix):
            jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw)
            if len(jobs) >= 10:
                stale = True
        #remove from cache jobs with problems or in progress
        #we need to check whether problems were solved in the next run
        if url.startswith(jobprefix):
            job = url[len(jobprefix):]
            # NOTE(review): the HTML parts of the next two patterns were lost
            # from the source (tags stripped); they are reconstructed here as
            # '<a href="...">' and '</a></td><td>size</td>' -- confirm against
            # a real viewer job page.
            jsonfileurls = re.findall(r'(?im)<a href="([^<> ]+\.json)">', raw)
            warcs = re.findall(r'(?im)>\s*[^<>"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*</a>\s*</td>\s*<td>(\d+)</td>' % (re.escape(job)), raw)
            if (not jsonfileurls or not warcs) and thisyear.search(raw): #job in progress
                stale = True
        if 'borg.xyz/logs/' in url and '.log' not in url:
            stale = True
        if stale:
            removeFromArchivebotCache(url=url, save=False)
    saveArchivebotCache()
def _fetchURLText(request, gzipped=False):
    """Open request and return its body as stripped, UTF-8 decoded text (gunzipped first when gzipped is True)."""
    with urllib.request.urlopen(request) as response:
        if gzipped:
            body = gzip.GzipFile(fileobj=response).read()
        else:
            body = response.read()
    return body.strip().decode('utf-8')

def getURL(url='', cache=False, retry=True):
    """Download url and return its content as text ('' on failure).

    url   -- address to fetch. URLs ending in '.gz' are gunzipped. An empty
             URL, or one containing '8grwh' (a deleted job), yields ''.
    cache -- when True, return the cached text if present and store a
             freshly downloaded text in ArchivebotCache.
    retry -- when True, a failed download is retried after 10 and then 20
             seconds. A failed '.json' download is never retried (some
             .json are deleted on IA) and yields ''.

    Retrying stops at the first successful attempt.
    """
    global ArchivebotCache
    if not url:
        return ''
    if '8grwh' in url: #deleted jobs/jsons
        return ''
    if cache: #do not download if it is cached
        if not ArchivebotCache: #empty dict
            ArchivebotCache = loadArchivebotCache()
        if url in ArchivebotCache:
            #print("Using cached page for %s" % (url))
            return ArchivebotCache[url]
    #headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0' }
    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0, ArchiveWikiListGenerator) Gecko/20100101 Firefox/55.0' }
    request = urllib.request.Request(url, headers=headers)
    gzipped = url.endswith('.gz')
    raw = ''
    fetched = False
    print("Retrieving: %s" % (url))
    try:
        raw = _fetchURLText(request, gzipped)
        fetched = True
    except Exception: # best effort: any network/decoding failure counts as a miss
        if url.endswith('.json'): #some .json are deleted on IA
            return ''
        sleep = 10 # seconds
        maxsleep = 30
        while retry and sleep <= maxsleep:
            print('Error while retrieving: %s' % (url))
            print('Retry in %s seconds...' % (sleep))
            time.sleep(sleep)
            try:
                raw = _fetchURLText(request, gzipped)
                fetched = True
                break # success: do not sleep and download again
            except Exception:
                sleep = sleep * 2
    if fetched and cache: #refresh cache
        ArchivebotCache[url] = raw
        # Persist only occasionally (randint(0, 100) == 0) to avoid
        # rewriting the pickle after every download.
        if not random.randint(0, 100):
            saveArchivebotCache()
    return raw
def genJobDetails(mode='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False):
    """Format one job as a table row string.

    The row holds mode, domainlink, joburl, jobdate, a size cell and an
    objects cell. The objects cell sorts on the first space-separated token
    of jobobjects. The size cell depends on jobsize:
      * not an int: sort value 0 and the value as returned by convertsize;
      * int below 1024: wrapped in the {{red|...}} template;
      * otherwise: wrapped in {{green|...}}, {{orange|...}} when jobaborted,
        or {{purple|...}} when jobproblem (jobproblem wins over jobaborted).
    """
    objectskey = jobobjects.split(' ')[0]
    readablesize = convertsize(b=jobsize)

    if type(jobsize) is not int:
        return '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=0 | %s || data-sort-value=%s | %s' % (mode, domainlink, joburl, jobdate, readablesize, objectskey, jobobjects)

    if jobsize < 1024:
        return '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{red|%s}} || data-sort-value=%s | %s' % (mode, domainlink, joburl, jobdate, jobsize, readablesize, objectskey, jobobjects)

    color = 'green'
    if jobaborted:
        color = 'orange'
    if jobproblem:
        color = 'purple'
    return '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{%s|%s}} || data-sort-value=%s | %s' % (mode, domainlink, joburl, jobdate, jobsize, color, readablesize, objectskey, jobobjects)
def getArchiveDetailsArchivebot(url='', singleurl=False):
    """Collect table rows for the ArchiveBot jobs found for url.

    url       -- URL to look up; must contain '://', otherwise ([], 0) is
                 returned.
    singleurl -- when True, a job is only kept if its .json file has a 'url'
                 value equal to url (ignoring leading/trailing slashes).

    Searches the ArchiveBot viewer for url, visits the matching domain pages,
    keeps the jobs whose listing mentions url (at most 10 per domain) and
    builds one row per job run with genJobDetails.

    Returns (details, totaljobsize): the list of row strings and the sum of
    the sizes of the rows' WARC files.
    """
    if '://' not in url:
        return [], 0
    viewerurl = 'https://archive.fart.website/archivebot/viewer/?q=' + url
    urlnoscheme = url.split('://', 1)[1]
    origdomain = urlnoscheme.split('/')[0]
    # Leading "www." / "www2." is reduced to "." so that sibling subdomains match.
    origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain)
    rawdomains = getURL(url=viewerurl, cache=False)
    domains = list(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains)))
    if not domains: #no results for this url, remove cache
        removeFromArchivebotCache(url=viewerurl)
    details = []
    totaljobsize = 0
    jobslimit = 10 # before 10000
    for domain in domains:
        if domain != origdomain and domain not in origdomain and origdomain2 not in domain:
            continue
        urljobs = "https://archive.fart.website/archivebot/viewer/domain/" + domain
        rawjobs = getURL(url=urljobs, cache=False) #false, we want the most recent list of jobs always
        #remove unrelated jobs, for example googlesites, facebook, etc
        # NOTE(review): the separator was lost from the source (it read
        # split(""), which raises ValueError); "<tr>" is assumed, i.e. one
        # table row per job -- confirm against a real viewer domain page.
        wanted = urlnoscheme.strip('/')
        rawjobs = "".join(chunk for chunk in rawjobs.split("<tr>") if wanted in chunk)
        jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs)
        print("jobs", jobs)
        for jobid in jobs[:jobslimit]:
            jobidtruncated = jobid[-5:]
            jobidpattern = re.escape(jobidtruncated)
            urljob = "https://archive.fart.website/archivebot/viewer/job/" + jobid
            print(urljob)
            rawjob = getURL(url=urljob, cache=True)
            # NOTE(review): the HTML parts of this pattern and of the warcs
            # pattern below were lost from the source (tags stripped); they
            # are reconstructed as '<a href="...">' and
            # '</a></td><td>size</td>' -- confirm against a real job page.
            jsonfileurls = re.findall(r'(?im)<a href="([^<> ]+\.json)">', rawjob)
            for jsonfileurl in jsonfileurls:
                print(jsonfileurl)
                if singleurl:
                    jsonraw = getURL(url=jsonfileurl, cache=True) #cache json from internet archive
                    try:
                        jsonfileloaded = json.loads(jsonraw)
                    except ValueError: # empty or invalid json (deleted on IA)
                        continue
                    joburlvalue = jsonfileloaded.get('url') if isinstance(jsonfileloaded, dict) else None
                    if not isinstance(joburlvalue, str) or joburlvalue.strip('/') != url.strip('/'):
                        continue
                # Each match is (inf|shallow, YYYYMMDD, HHMMSS, size).
                warcs = re.findall(r'(?im)<a href="[^<>" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz">[^<>"]*?</a>\s*</td>\s*<td>(\d+)</td>' % (jobidpattern), rawjob)
                print(warcs)
                # Without warcs there are no datetimes below, so no row is
                # emitted for this json file and the flag stays unused.
                jobproblem = not warcs
                jobdatetimes = sorted(set("%s-%s" % (warc[1], warc[2]) for warc in warcs))
                for jobdatetime in jobdatetimes:
                    if jobdatetime not in jsonfileurl:
                        continue
                    warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobidpattern), rawjob))
                    inforshallow = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobidpattern), rawjob)))
                    print(jobdatetime, warcsnometa, inforshallow)
                    inforshallow = inforshallow[0] if len(inforshallow) == 1 else 'unknown'
                    # Anything that is not clearly 'inf' is reported as shallow.
                    toolb = 'Recursive (!a)' if inforshallow == 'inf' else 'Shallow (!ao)'
                    jobaborted = ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob
                    # jobdatetime is "YYYYMMDD-HHMMSS"; show the date as YYYY-MM-DD.
                    jobdate = jobdatetime.split('-')[0]
                    jobdate = '%s-%s-%s' % (jobdate[0:4], jobdate[4:6], jobdate[6:8])
                    jobsize = sum(int(warc[3]) for warc in warcs if jobdatetime == '%s-%s' % (warc[1], warc[2]))
                    #jobdetails = genJobDetails(tool=toolb, domainlink="[https://archive.fart.website/archivebot/viewer/domain/%s %s]" % (domain, domain), joburl="[https://archive.fart.website/archivebot/viewer/job/%s %s]" % (jobidtruncated, jobidtruncated), jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
                    jobdetails = genJobDetails(mode=toolb, domainlink=domain, joburl="{{ArchiveBot job|"+jobidtruncated+"}}", jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
                    totaljobsize += jobsize
                    details.append(jobdetails)
    return details, totaljobsize
def getArchiveDetails(url=''):
    """Look up the ArchiveBot jobs for url.

    Returns (found, rows, totaljobsize): whether any job row was produced,
    the rows joined with '\\n|-\\n', and the total size reported by
    getArchiveDetailsArchivebot.

    (False, '', 0) is returned without any lookup when url is empty, has no
    '://', has an empty host, or points to archive.org / www.webcitation.org.
    A URL with more than a bare "/" after the host is looked up as a single
    URL (singleurl=True); otherwise it is looked up as a whole domain.
    """
    if not url or '://' not in url:
        return False, '', 0
    if '://archive.org/' in url or '://www.webcitation.org/' in url:
        return False, '', 0
    # Split once on the scheme and once on the first "/": splitting the whole
    # URL on the host name breaks when that name appears again in the path.
    domain, slash, path = url.split('://', 1)[1].partition('/')
    if not domain:
        return False, '', 0
    singleurl = len(slash + path) > 1 #url is domain.ext/more
    details, totaljobsize = getArchiveDetailsArchivebot(url=url, singleurl=singleurl)
    return bool(details), '\n|-\n'.join(details), totaljobsize