#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py
# Copyright (C) 2018-2019 Archive Team
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import datetime
import gzip
import json
import os
import pickle
import random
import re
import sys
import _thread
import time
import unicodedata
import urllib
import urllib.request
import urllib.parse

# The cache lives next to this file. os.path is used (instead of splitting
# __file__ on "/") so that a bare relative __file__ such as "script.py" does
# not resolve to "/archivebot.cache.pickle" in the filesystem root.
ArchiveBotCacheFile = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "archivebot.cache.pickle")
# In-memory cache: fetched URL -> decoded page body.
ArchivebotCache = {}

_VIEWER_URL = "https://archive.fart.website/archivebot/viewer/"
_USER_AGENT = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0, '
               'ArchiveWikiListGenerator) Gecko/20100101 Firefox/55.0')

# Fix:
# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?)
#
# Error no json:
# Retry in 20 seconds...
# Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye
# Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json


def convertsize(b=0):
    """Format a byte count as a short binary-unit string.

    Ints below 1 KiB give '0 KiB'; KiB and MiB are truncated to whole
    numbers; GiB and larger use one decimal. Anything that is not exactly
    an ``int`` is returned unchanged.
    """
    # Exact type check on purpose: callers pass '' for "unknown size" and
    # expect it back untouched.
    if type(b) is not int:
        return b
    if b < 1024:  # <1KiB
        return '0 KiB'
    if b < 1024 ** 2:  # <1MiB
        return '%d KiB' % (b // 1024)
    if b < 1024 ** 3:  # <1GiB
        return '%d MiB' % (b // 1024 ** 2)
    for exponent, unit in ((3, 'GiB'), (4, 'TiB'), (5, 'PiB')):
        if b < 1024 ** (exponent + 1):
            return '%.1f %s' % (b / 1024.0 ** exponent, unit)
    # Previously values >= 1 EiB fell through the chain and returned None.
    return '%.1f EiB' % (b / 1024.0 ** 6)


def loadArchivebotCache():
    """Return a copy of the pickled cache, or an empty dict if no file exists."""
    c = {}
    if os.path.exists(ArchiveBotCacheFile):
        # NOTE(review): pickle.load can execute arbitrary code from a
        # tampered file; the cache file must only ever be written by this
        # program.
        with open(ArchiveBotCacheFile, 'rb') as f:
            c = pickle.load(f)
    return c.copy()


def removeFromArchivebotCache(url='', save=True):
    """Drop ``url`` from the in-memory cache and, if ``save``, persist it.

    The cache is written even when ``url`` was not present.
    """
    if url and url in ArchivebotCache:
        del ArchivebotCache[url]
    if save:
        saveArchivebotCache()


def saveArchivebotCache():
    """Pickle the in-memory cache to ``ArchiveBotCacheFile``."""
    with open(ArchiveBotCacheFile, 'wb') as f:
        pickle.dump(ArchivebotCache, f)


def cleanArchiveBotCache():
    """Evict cache entries that must be re-fetched on the next run, then save.

    Evicted: searches without results, domain pages listing 10 or more jobs,
    job pages that look unfinished, and 'borg.xyz/logs/' URLs without '.log'.
    """
    searchprefix = _VIEWER_URL + "?q="
    domainprefix = _VIEWER_URL + "domain/"
    jobprefix = _VIEWER_URL + "job/"
    # A current-year timestamp in the page marks a job that may still be running.
    thisyear = r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year)

    # Iterate over a snapshot because entries are deleted while looping.
    for url, raw in ArchivebotCache.copy().items():
        #remove from cache urls without results
        #we need to check for results in the next run
        if url.startswith(searchprefix):
            if re.search(r'(?im)No search results.', raw):
                removeFromArchivebotCache(url=url, save=False)

        #remove from cache domains with many jobs (FB, TW, etc)
        #these result pages change frequently
        if url.startswith(domainprefix):
            jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw)
            if len(jobs) >= 10:
                removeFromArchivebotCache(url=url, save=False)

        #remove from cache jobs with problems or in progress
        #we need to check whether problems were solved in the next run
        if url.startswith(jobprefix):
            job = url.split(jobprefix)[1]
            # NOTE(review): this pattern looks truncated (its opening part,
            # presumably HTML markup, is missing) and has an unbalanced
            # parenthesis, so re.findall raises re.error here. Restore the
            # literal from the upstream file.
            jsonfileurls = re.findall(r'(?im) ]+\.json)">', raw)
            if not jsonfileurls and re.search(thisyear, raw):  #job in progress
                removeFromArchivebotCache(url=url, save=False)
            # NOTE(review): the run of bare "\s*" suggests markup was stripped
            # from this pattern as well - confirm against upstream.
            warcs = re.findall(r"(?im)>\s*[^<>\"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*\s*\s*(\d+)" % (job), raw)
            if not warcs and re.search(thisyear, raw):  #job in progress
                removeFromArchivebotCache(url=url, save=False)

        if 'borg.xyz/logs/' in url and '.log' not in url:
            removeFromArchivebotCache(url=url, save=False)

    saveArchivebotCache()


def _download(request, url):
    """Open ``request`` and return its body stripped and decoded as UTF-8.

    Bodies of URLs ending in '.gz' are gunzipped first.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(request) as response:
        if url.endswith('.gz'):
            payload = gzip.GzipFile(fileobj=response).read()
        else:
            payload = response.read()
    return payload.strip().decode('utf-8')


def getURL(url='', cache=False, retry=True):
    """Fetch ``url`` and return its text, or '' when it cannot be retrieved.

    url   -- address to download.
    cache -- when true, serve from / store into ArchivebotCache (loaded from
             disk on first use, saved back at random after a fresh fetch).
    retry -- when true, a failed download is retried after 10 and 20 seconds.

    URLs ending in '.json' are never retried; a failure returns ''.
    """
    global ArchivebotCache
    if '8grwh' in url:  #deleted jobs/jsons
        return ''
    if cache:  #do not download if it is cached
        if not ArchivebotCache:  #empty dict
            ArchivebotCache = loadArchivebotCache()
        if url and url in ArchivebotCache:
            return ArchivebotCache[url]

    raw = ''
    request = urllib.request.Request(url, headers={'User-Agent': _USER_AGENT})
    print("Retrieving: %s" % (url))
    try:
        raw = _download(request, url)
        if cache:  #refresh cache
            ArchivebotCache[url] = raw
            # Persist only occasionally to avoid rewriting the file per fetch.
            if not random.randint(0, 100):
                saveArchivebotCache()
        return raw
    except Exception:
        # "except Exception" (not a bare except) so Ctrl-C still interrupts.
        if url.endswith('.json'):  #some .json are deleted on IA
            return ''

    sleep = 10  # seconds
    maxsleep = 30
    while retry and sleep <= maxsleep:
        print('Error while retrieving: %s' % (url))
        print('Retry in %s seconds...' % (sleep))
        time.sleep(sleep)
        try:
            raw = _download(request, url)
        except Exception:
            sleep = sleep * 2
            continue
        if cache:  #refresh cache
            ArchivebotCache[url] = raw
        # Stop once a retry succeeds; the loop used to keep sleeping and
        # downloading the same page again.
        break
    return raw


def genJobDetails(mode='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False):
    """Build one wiki table row describing a job.

    When ``jobsize`` is an int the size cell is wrapped in a colour template:
    red below 1 KiB, otherwise purple for ``jobproblem``, orange for
    ``jobaborted``, green by default. Any other ``jobsize`` is emitted as-is
    with a sort value of 0. The sort value of the last cell is the first
    space-separated token of ``jobobjects``.
    """
    rowstart = '| style="white-space: nowrap;" | %s || %s || %s || %s || ' % (mode, domainlink, joburl, jobdate)
    if type(jobsize) is int:
        if jobsize < 1024:
            jobcolor = 'red'
        elif jobproblem:  # a problem takes precedence over an abort
            jobcolor = 'purple'
        elif jobaborted:
            jobcolor = 'orange'
        else:
            jobcolor = 'green'
        sizecell = 'data-sort-value=%d | {{%s|%s}}' % (jobsize, jobcolor, convertsize(b=jobsize))
    else:
        sizecell = 'data-sort-value=0 | %s' % (convertsize(b=jobsize),)
    objectscell = ' || data-sort-value=%s | %s' % (jobobjects.split(' ')[0], jobobjects)
    return rowstart + sizecell + objectscell


def getArchiveDetailsArchivebot(url='', singleurl=False):
    """Collect wiki rows for the ArchiveBot jobs that match ``url``.

    url       -- URL including its scheme ('://' is required).
    singleurl -- when true, a job is kept only if the 'url' field of its
                 .json file equals ``url`` (surrounding '/' ignored).

    Returns ``(details, totaljobsize)``: the row strings produced by
    genJobDetails and the sum of their sizes. At most 10 jobs are inspected
    per domain.
    """
    viewerurl = _VIEWER_URL + '?q=' + url
    urlnoscheme = url.split('://')[1]
    origdomain = urlnoscheme.split('/')[0]
    origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain)
    urlneedle = urlnoscheme.strip('/')

    rawdomains = getURL(url=viewerurl, cache=False)
    domains = list(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains)))
    if not domains:  #no results for this url, remove cache
        removeFromArchivebotCache(url=viewerurl)

    details = []
    totaljobsize = 0
    jobslimit = 10  # before 10000
    for domain in domains:
        # Keep only domains related to the requested host (an equal domain
        # is covered by the substring test).
        if domain not in origdomain and origdomain2 not in domain:
            continue
        urljobs = _VIEWER_URL + "domain/" + domain
        rawjobs = getURL(url=urljobs, cache=False)  #false, we want the most recent list of jobs always

        #remove unrelated jobs, for example googlesites, facebook, etc
        # NOTE(review): the separator literal is empty, which makes
        # str.split raise ValueError; it looks like an HTML tag was stripped
        # from it. Restore the literal from the upstream file.
        rawjobs = "".join(
            candidate for candidate in rawjobs.split("")
            if urlneedle in candidate)
        jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs)
        print("jobs", jobs)

        for jobid in jobs[:jobslimit]:
            jobidtruncated = jobid[-5:]
            urljob = _VIEWER_URL + "job/" + jobid
            print(urljob)
            rawjob = getURL(url=urljob, cache=True)

            # Everything below depends only on the job page, so it is
            # computed once per job rather than once per .json link.
            # NOTE(review): the leading part of this pattern and the bare
            # "\s*\s*" look like remnants of stripped markup - confirm.
            warcs = re.findall(r"(?im)\" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\">[^<>\"]*?\s*\s*(\d+)" % (jobidtruncated), rawjob)
            jobproblem = not warcs
            jobdatetimes = sorted({"%s-%s" % (warc[1], warc[2]) for warc in warcs})
            warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobidtruncated), rawjob))
            modesfound = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobidtruncated), rawjob)))
            inforshallow = modesfound[0] if len(modesfound) == 1 else 'unknown'
            toolb = 'Recursive (!a)' if inforshallow == 'inf' else 'Shallow (!ao)'

            # NOTE(review): an empty pattern matches at every position, so
            # this yields only empty strings; the real pattern appears to
            # have been stripped. Restore it from the upstream file.
            jsonfileurls = re.findall(r'(?im)', rawjob)
            for jsonfileurl in jsonfileurls:
                print(jsonfileurl)
                if singleurl:
                    jsonraw = getURL(url=jsonfileurl, cache=True)  #cache json from internet archive
                    try:
                        jsonfileloaded = json.loads(jsonraw)
                    except (ValueError, TypeError):
                        continue
                    recordedurl = jsonfileloaded.get('url') if isinstance(jsonfileloaded, dict) else None
                    if not isinstance(recordedurl, str) or recordedurl.strip('/') != url.strip('/'):
                        continue
                print(warcs)

                for jobdatetime in jobdatetimes:
                    if jobdatetime not in jsonfileurl:
                        continue
                    print(jobdatetime, warcsnometa, modesfound)
                    jobaborted = (
                        ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob
                        or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob)
                    jobdate = (jobdatetime.split('-')[0] if '-' in jobdatetime else '') or 'nodate'
                    if jobdate != 'nodate':
                        jobdate = '%s-%s-%s' % (jobdate[0:4], jobdate[4:6], jobdate[6:8])
                    jobsize = sum(
                        int(warc[3]) for warc in warcs
                        if jobdatetime == '%s-%s' % (warc[1], warc[2]))
                    jobdetails = genJobDetails(mode=toolb, domainlink=domain, joburl="{{ArchiveBot job|" + jobidtruncated + "}}", jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
                    totaljobsize += jobsize
                    details.append(jobdetails)
    return details, totaljobsize


def getArchiveDetails(url=''):
    """Return ``(found, rows, totalsize)`` for ``url``.

    ``rows`` is the job rows joined with wiki row separators. URLs without a
    scheme or host, and archive.org / webcitation.org URLs, give
    ``(False, '', 0)``. A URL with a path beyond a lone '/' is looked up as
    a single URL, otherwise as a whole domain.
    """
    if not url or '://' not in url:
        return False, '', 0
    if '://archive.org/' in url or '://www.webcitation.org/' in url:
        return False, '', 0

    afterscheme = url.split('://', 1)[1]
    domain = afterscheme.split('/')[0]
    if not domain:
        # No host to look up (previously url.split('') raised ValueError).
        return False, '', 0
    # Slice the path after the host rather than splitting the URL on the host
    # text, which misfired when that text also occurred elsewhere in the URL.
    path = afterscheme[len(domain):]
    details, totaljobsize = getArchiveDetailsArchivebot(url=url, singleurl=len(path) > 1)
    return bool(details), '\n|-\n'.join(details), totaljobsize