From 34df4cd5eb5d945f456a0f4a3584438a5a850976 Mon Sep 17 00:00:00 2001 From: klea Date: Sun, 17 May 2026 13:28:12 +0000 Subject: wikibot: Move some scripts to wikibot-manual --- wikibot-manual/archivebot.py | 264 ++++++++++++++++++ wikibot-manual/archiveteamfun.py | 547 +++++++++++++++++++++++++++++++++++++ wikibot-manual/urlteam-torrents.py | 20 ++ wikibot/archivebot.py | 264 ------------------ wikibot/archiveteamfun.py | 547 ------------------------------------- wikibot/urlteam-torrents.py | 20 -- 6 files changed, 831 insertions(+), 831 deletions(-) create mode 100644 wikibot-manual/archivebot.py create mode 100644 wikibot-manual/archiveteamfun.py create mode 100644 wikibot-manual/urlteam-torrents.py delete mode 100644 wikibot/archivebot.py delete mode 100644 wikibot/archiveteamfun.py delete mode 100644 wikibot/urlteam-torrents.py diff --git a/wikibot-manual/archivebot.py b/wikibot-manual/archivebot.py new file mode 100644 index 0000000..7246a8d --- /dev/null +++ b/wikibot-manual/archivebot.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# https://github.com/emijrp/internet-archive/raw/master/archivebot.py + +# Copyright (C) 2018-2019 Archive Team +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import collections +import datetime +import json +import re +import sys +import time +import urllib.parse +import urllib.request +import pywikibot +import pywikibot.pagegenerators as pagegenerators + +from archiveteamfun import * + +Entry = collections.namedtuple('Entry', ('sorturl', 'url', 'label', 'note', 'line')) +truncationpattern = re.compile(r'^[^:/]+://(www\.)?') + +def parselistline(line): + label = None + note = None + if '|' in line: + url, rest = line.split('|', 1) + args = map(str.strip, rest.split('|')) + for position, arg in enumerate(args): + if '=' in arg: + key, value = map(str.strip, arg.split('=', 1)) + if key == 'label': + label = value + continue + elif key == 'note': + note = value + continue + # If it's neither, just treat it like it didn't have any '=' to begin with... + if position == 0: + label = arg + elif position == 1: + note = arg + # Everything else is ignored + else: + url = line + url = url.strip() + if '://' in url and not '/' in url.split('://')[1]: + url = url + '/' + line = url + (' | label = ' + label if label else '') + (' | note = ' + note if note else '') + sorturl = truncationpattern.sub('', url).lower() + for domain in ('transfer.sh', 'transfer.kiska.pw', 'transfer.notkiska.pw', 'ix.io'): + if domain == 'ix.io' and '+' not in sorturl: + # Only apply this stripping to the undocumented trick URLs of format ix.io/code+/filename + continue + if sorturl.startswith(domain) and sum(x == '/' for x in sorturl) == 2: + # For file hosting URLs that contain exactly two slashes, strip the first path component = the random file ID to sort by the filename instead. + sorturl = domain + sorturl[sorturl.index('/', len(domain) + 1):] + return Entry(sorturl = sorturl, url = url, label = label, note = note, line = line) + +def curateurls(wlist=''): + # Returns a dict of sectionname => list of URLs entries + # sectionname is None for URLs outside of a section (i.e. on a page without section or before the first section). + # A "URL entry" in the list is an Entry object (namedtuple); the label is None if it isn't present. + + lines = [] + currentsectionname = None + currentsectionentries = [] + sectionentries = {} + + def endsection(): + nonlocal currentsectionentries, lines, sectionentries, currentsectionname + currentsectionentries = list(set(currentsectionentries)) # Deduplicate + currentsectionentries.sort(key = lambda x: (x.sorturl, x.label if x.label is not None else '', x.url, x.note if x.note is not None else '', x.line)) + lines.extend(x.line for x in currentsectionentries) + sectionentries[currentsectionname] = currentsectionentries + currentsectionentries = [] + + for line in wlist.text.strip().splitlines(): + if line.strip().startswith('='): + # New section, sort and append previous section + endsection() + currentsectionname = line.strip().strip('=').strip() + if currentsectionname in sectionentries: + print('Warning: duplicate section name {!r} on page {}'.format(currentsectionname, wlist.title())) + if lines: + lines.append('') + lines.append(line.strip()) + elif line.strip(): + currentsectionentries.append(parselistline(line)) + endsection() + + lines = '\n'.join(lines) + if wlist.text != lines: + wlist.text = lines + wlist.save("BOT - Sorting list") + + return sectionentries + +def main(): + atsite = pywikibot.Site('en', 'ArchiveTeam') + cat = pywikibot.Category(atsite, "Category:ArchiveBot") + gen = pagegenerators.CategorizedPageGenerator(cat, start="!") + pre = pagegenerators.PreloadingGenerator(gen) + listlenlimit = 1000 + for page in pre: + wtitle = page.title() + wtext = page.text + + if len(sys.argv)>1 and not sys.argv[1] in wtitle: + continue + + if not wtitle.startswith('ArchiveBot/'): + continue + wlist = pywikibot.Page(atsite, '%s/list' % (wtitle)) + if not wlist.exists(): + print("Page %s/list doesnt exist" % (wtitle)) + continue + sectionentries = curateurls(wlist=wlist) + + print('\n===', wtitle, '===') + if (not '' in wtext and not '' in wtext: + print("No tag. Skiping...") + continue + if len(wlist.text.splitlines()) > listlenlimit: + continue + + newtext = [] + totaljobsize = 0 + totalsaved = 0 + totalnotsaved = 0 + + # Find blocks of page text that end with a bot tag + blocks = wtext.split('') + + # The last block must be tag-free, so only iterate over the previous ones + for block in blocks[:-1]: + # Find beginning of bot tag + pos = block.find('') + if pos == -1: + pos = block.find('') + continue + + if block[pos:].startswith(''): + # Sectionless tag, use section None + section = None + openingtag = '' + elif block[pos:].startswith('', pos) + if openend == -1: + print("Block's opening tag does not have an end, skipping...") + newtext.append(block) + newtext.append('') + continue + section = block[pos + 9:openend].strip() # 9 = len('') + continue + + if section not in sectionentries: + print('Block references section {!r} which does not exist, skipping...'.format(section)) + newtext.append(block) + newtext.append('') + continue + + # Add prefixed text (if any) + newtext.append(block[:pos]) + + # Add opening tag (as it was before) + newtext.append(openingtag) + + # Generate table + c = 1 + rowsplain = "" + sectionjobsize = 0 + sectionhasnotes = any(entry.note is not None for entry in sectionentries[section]) + for entry in sectionentries[section]: + viewerplain = '' + viewerdetailsplain = '' + viewer = [getArchiveDetails(url=entry.url)] + if viewer[0][0]: + viewerplain = "{{saved}}" + viewerdetailsplain = viewer[0][1] + sectionjobsize += viewer[0][2] + else: + viewerplain = "{{notsaved}}" + viewerdetailsplain = '' + rowspan = len(re.findall(r'\|-', viewerdetailsplain))+1 + rowspanplain = 'rowspan=%d | ' % (rowspan) if rowspan>1 else '' + if entry.label: + urllabel = '{{URLAB|1=%s|2=%s}}' % (entry.url, entry.label) + else: + urllabel = '{{URLAB|1=%s}}' % (entry.url) + if sectionhasnotes: + notescolumn = '%s%s || ' % (rowspanplain, entry.note if entry.note is not None else '') + else: + notescolumn = '' + rowsplain += "\n|-\n| %s%s || %s%s%s\n%s " % (rowspanplain, urllabel, notescolumn, rowspanplain, viewerplain, viewerdetailsplain if viewerdetailsplain else '| || || || || || ') + c += 1 + + totaljobsize += sectionjobsize + sectionsaved = rowsplain.count('{{saved}}') + totalsaved += sectionsaved + sectionnotsaved = rowsplain.count('{{notsaved}}') + totalnotsaved += sectionnotsaved + notesheader = 'rowspan=2 | Notes !! ' if sectionhasnotes else '' + output = """ +* '''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s) + +Do not edit this table, it is automatically updated by bot. There is a [[{{FULLPAGENAME}}/list|raw list]] of URLs that you can edit. + +{| class="wikitable sortable plainlinks" +! rowspan=2 | Website !! %srowspan=2 | Status !! colspan=6 | Archive details +|- +! Tool !! Domain !! Job !! Date !! Size !! Objects %s +|} +""" % (sectionsaved, sectionnotsaved, convertsize(b=sectionjobsize), notesheader, rowsplain) + newtext.append(output) + + newtext.append('') + + # Add the last, tag-free block + newtext.append(blocks[-1]) + + newtext = ''.join(newtext) + + # Replace total statistics if necessary + if '' in newtext: + newtext = re.sub(r'.*?', "'''Statistics''': {{saved}} (%s)){{·}} {{notsaved}} (%s){{·}} Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b = totaljobsize)), newtext) + + if wtext != newtext: + pywikibot.showDiff(wtext, newtext) + page.text = newtext + try: + page.save("BOT - Updating page: {{saved}} (%s), {{notsaved}} (%s), Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b=totaljobsize))) + except: + print("Error while saving...") + else: + print("No changes needed in", page.title()) + + cleanArchiveBotCache() + +if __name__ == '__main__': + main() diff --git a/wikibot-manual/archiveteamfun.py b/wikibot-manual/archiveteamfun.py new file mode 100644 index 0000000..2d5ff54 --- /dev/null +++ b/wikibot-manual/archiveteamfun.py @@ -0,0 +1,547 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py + +# Copyright (C) 2018-2019 Archive Team +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import datetime +import gzip +import json +import os +import pickle +import random +import re +import sys +import _thread +import time +import unicodedata +import urllib +import urllib.request +import urllib.parse + +ArchivebotCache = {} +ChromebotCache = {} +NarabotCache = {} +WikiteamCache = {} +YoutubearchiveCache = {} + +# Ideas: +# Mirrortube (no channel info in metadata :() +# Videobot +# +# Fix: +# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?) +# +# Error no json: +"""Retry in 20 seconds... +Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye +Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json +""" + +def convertsize(b=0): #bytes + if type(b) is int: + if b < 1024: #<1KiB + return '0 KiB' + elif b < 1024*1024: #<1MiB + return '%d KiB' % (b/(1024)) + elif b < 1024*1024*1024: #<1GiB + return '%d MiB' % (b/(1024*1024)) + elif b < 1024*1024*1024*1024: #<1TiB + return '%.1f GiB' % (b/(1024.0*1024*1024)) + elif b < 1024*1024*1024*1024*1024: #<1PiB + return '%.1f TiB' % (b/(1024.0*1024*1024*1024)) + elif b < 1024*1024*1024*1024*1024*1024: #<1EiB + return '%.1f PiB' % (b/(1024.0*1024*1024*1024*1024)) + else: + return b + +def loadArchivebotCache(): + c = {} + if os.path.exists('archivebot.cache'): + with open('archivebot.cache', 'rb') as f: + c = pickle.load(f) + return c.copy() + +def removeFromArchivebotCache(url='', save=True): + global ArchivebotCache + if url and url in ArchivebotCache: + del ArchivebotCache[url] + if save: + saveArchivebotCache() + +def saveArchivebotCache(): + global ArchivebotCache + with open('archivebot.cache', 'wb') as f: + pickle.dump(ArchivebotCache, f) + +def cleanArchiveBotCache(): + global ArchivebotCache + ArchivebotCache2 = ArchivebotCache.copy() + + for url, raw in ArchivebotCache2.items(): + #remove from cache urls without results + #we need to check for results in the next run + if url.startswith("https://archive.fart.website/archivebot/viewer/?q="): + if re.search(r'(?im)No search results.', raw): + removeFromArchivebotCache(url=url, save=False) + + #remove from cache domains with many jobs (FB, TW, etc) + #these result pages change frequently + if url.startswith("https://archive.fart.website/archivebot/viewer/domain/"): + domain = url.split("https://archive.fart.website/archivebot/viewer/domain/")[1] + jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw) + if len(jobs) >= 10: + removeFromArchivebotCache(url=url, save=False) + + #remove from cache jobs with problems or in progress + #we need to check wether problems were solved in the next run + if url.startswith("https://archive.fart.website/archivebot/viewer/job/"): + job = url.split("https://archive.fart.website/archivebot/viewer/job/")[1] + jsonfileurls = re.findall(r'(?im) ]+\.json)">', raw) + if not jsonfileurls and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress + removeFromArchivebotCache(url=url, save=False) + warcs = re.findall(r"(?im)>\s*[^<>\"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*\s*\s*(\d+)" % (job), raw) + if not warcs and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress + removeFromArchivebotCache(url=url, save=False) + + if 'borg.xyz/logs/' in url and not '.log' in url: + removeFromArchivebotCache(url=url, save=False) + + saveArchivebotCache() + +def loadChromebotCache(): + c = {} + if os.path.exists('chromebot.cache'): + with open('chromebot.cache', 'rb') as f: + c = pickle.load(f) + firstcached = datetime.datetime(2019, 5, 7) + today = datetime.datetime.today() + iaquery = 'https://archive.org/advancedsearch.php?q=chromebot&fl[]=identifier&sort[]=publicdate+desc&sort[]=&sort[]=&rows=5000000&page=1&output=json' + raw = getURL(url=iaquery, cache=False) + json1 = json.loads(raw) + for item in json1["response"]["docs"]: + itemname = item['identifier'] + if not re.search(r'chromebot-\d\d\d\d-\d\d-\d\d-', itemname): + continue + itemdate = itemname.split('chromebot-')[1][:10] + itemdate = datetime.datetime(int(itemdate.split('-')[0]), int(itemdate.split('-')[1]), int(itemdate.split('-')[2])) + if itemdate >= firstcached and itemdate <= today: + if not itemdate.isoformat() in c: + c[itemdate.isoformat()] = [] + urlitem = 'https://archive.org/download/%s' % (itemname) + raw2 = getURL(url=urlitem, cache=False) + print('Loading .json for', item, itemdate) + urljson = '' + if '"jobs.json"' in raw2: + urljson = 'https://archive.org/download/%s/jobs.json' % (itemname) + elif '"jobs.json.gz"' in raw2: + urljson = 'https://archive.org/download/%s/jobs.json.gz' % (itemname) + if urljson: + raw3 = getURL(url=urljson, cache=False) + for line in raw3.splitlines(): + if line.startswith('{"id":'): + json2 = json.loads(line) + json2['item'] = itemname + c[itemdate.isoformat()].append(json2) + #print(c[itemdate.isoformat()][-1]['id']) + return c.copy() + +def saveChromebotCache(): + global ChromebotCache + with open('chromebot.cache', 'wb') as f: + pickle.dump(ChromebotCache, f) + +def loadNarabotCache(): + c = {} + if os.path.exists('narabot.cache'): + with open('narabot.cache', 'rb') as f: + c = pickle.load(f) + iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Agithub_narabot_mirror&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json' + raw = getURL(url=iaquery, cache=False) + json1 = json.loads(raw) + for item in json1["response"]["docs"]: + if not 'originalurl' in item: + continue + itemname = item['identifier'] + originalurl = item['originalurl'] + if type(originalurl) is list: + originalurl = originalurl[0] + c[itemname] = { 'originalurl': originalurl } + return c.copy() + +def saveNarabotCache(): + global NarabotCache + with open('narabot.cache', 'wb') as f: + pickle.dump(NarabotCache, f) + +def loadWikiteamCache(): + c = {} + if os.path.exists('wikiteam.cache'): + with open('wikiteam.cache', 'rb') as f: + c = pickle.load(f) + iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Awikiteam&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json' + raw = getURL(url=iaquery, cache=False) + json1 = json.loads(raw) + for item in json1["response"]["docs"]: + if not 'originalurl' in item: + continue + itemname = item['identifier'] + originalurl = item['originalurl'] + if type(originalurl) is list: + originalurl = originalurl[0] + #if not itemname.startswith('wiki-'): + # continue + c[itemname] = { 'originalurl': originalurl } + return c.copy() + +def saveWikiteamCache(): + global WikiteamCache + with open('wikiteam.cache', 'wb') as f: + pickle.dump(WikiteamCache, f) + +def loadYoutubearchiveCache(): + pass + +def saveYoutubearchiveCache(): + pass + +def getURL(url='', cache=False, retry=True): + global ArchivebotCache + + if '8grwh' in url: #deleted jobs/jsons + return '' + + if cache: #do not download if it is cached + if not ArchivebotCache: #empty dict + ArchivebotCache = loadArchivebotCache() + if url: + if url in ArchivebotCache: + #print("Using cached page for %s" % (url)) + return ArchivebotCache[url] + raw = '' + headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0' } + request = urllib.request.Request(url, headers=headers) + try: + print("Retrieving: %s" % (url)) + response = urllib.request.urlopen(request) + if url.endswith('.gz'): + gzipFile = gzip.GzipFile(fileobj=response) + raw = gzipFile.read().strip().decode('utf-8') + else: + raw = response.read().strip().decode('utf-8') + if cache: #refresh cache + ArchivebotCache[url] = raw + if not random.randint(0, 100): + saveArchivebotCache() + except: + if url.endswith('.json'): #some .json are deleted on IA + return '' + + sleep = 10 # seconds + maxsleep = 30 + while retry and sleep <= maxsleep: + print('Error while retrieving: %s' % (url)) + print('Retry in %s seconds...' % (sleep)) + time.sleep(sleep) + try: + response = urllib.request.urlopen(request) + if url.endswith('.gz'): + gzipFile = gzip.GzipFile(fileobj=response) + raw = gzipFile.read().strip().decode('utf-8') + else: + raw = response.read().strip().decode('utf-8') + if cache: #refresh cache + ArchivebotCache[url] = raw + except: + pass + sleep = sleep * 2 + return raw + +def loadSPARQL(sparql=''): + json1 = '' + if sparql: + try: + json1 = json.loads(sparql) + return json1 + except: + print('Error downloading SPARQL? Malformatted JSON? Skiping\n') + return + else: + print('Server return empty file') + return + return + +def genJobDetails(tool='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False): + jobdetails = "" + if type(jobsize) is int: + if jobsize < 1024: + jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{red|%s}} || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, jobsize, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects) + else: + jobcolor = 'green' + if jobaborted: + jobcolor = 'orange' + if jobproblem: + jobcolor = 'purple' + jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{%s|%s}} || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, jobsize, jobcolor, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects) + else: + jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=0 | %s || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects) + return jobdetails + +def getArchiveDetailsArchivebot(url='', singleurl=False): + viewerurl = 'https://archive.fart.website/archivebot/viewer/?q=' + url + origdomain = url.split('://')[1].split('/')[0] + origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain) + rawdomains = getURL(url=viewerurl, cache=False) + domains = list(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains))) + if not domains: #no results for this url, remove cache + removeFromArchivebotCache(url=viewerurl) + details = [] + totaljobsize = 0 + jobslimit = 10 # before 10000 + tool = '[[ArchiveBot]]' + for domain in domains: + if domain != origdomain and not domain in origdomain and not origdomain2 in domain: + continue + urljobs = "https://archive.fart.website/archivebot/viewer/domain/" + domain + rawjobs = getURL(url=urljobs, cache=False) #false, we want the most recent list of jobs always + + #remove unrelated jobs, for example googlesites, facebook, etc + rawjobs2 = "" + for rawjobcandidate in rawjobs.split(""): + if url.split('://')[1].strip('/') in rawjobcandidate: + rawjobs2 += rawjobcandidate + rawjobs = rawjobs2 + + jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs) + print("jobs", jobs) + for jobid in jobs[:jobslimit]: + jobidtruncated = jobid[-5:] + urljob = "https://archive.fart.website/archivebot/viewer/job/" + jobid + print(urljob) + rawjob = getURL(url=urljob, cache=True) + jsonfileurls = re.findall(r'(?im)', rawjob) + for jsonfileurl in jsonfileurls: + print(jsonfileurl) + if singleurl: + jsonraw = getURL(url=jsonfileurl, cache=True) #cache json from internet archive + try: + jsonfileloaded = json.loads(jsonraw) + except: + continue + if not 'url' in jsonfileloaded or ('url' in jsonfileloaded and jsonfileloaded['url'].strip('/') != url.strip('/')): + continue + + jobproblem = False + warcs = re.findall(r"(?im)\" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\">[^<>\"]*?\s*\s*(\d+)" % (jobidtruncated), rawjob) + print(warcs) + if not warcs: + jobproblem = True + jobdatetimes = [] + for warc in warcs: + jobdatetimes.append("%s-%s" % (warc[1], warc[2])) + jobdatetimes = list(set(jobdatetimes)) + jobdatetimes.sort() + for jobdatetime in jobdatetimes: + if not jobdatetime in jsonfileurl: + continue + warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobidtruncated), rawjob)) + inforshallow = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobidtruncated), rawjob))) + + print(jobdatetime, warcsnometa, inforshallow) + + inforshallow = len(inforshallow) == 1 and inforshallow[0] or 'unknown' + toolb = "%s%s" % (tool, inforshallow == 'unknown' and '' or " (!%s)" % (inforshallow == 'inf' and 'a' or 'ao')) + jobaborted = False + if ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob: + jobaborted = True + jobdate = '-' in jobdatetime and jobdatetime.split('-')[0] or 'nodate' + jobsize = sum([jobdatetime == '%s-%s' % (warc[1], warc[2]) and int(warc[3]) or 0 for warc in warcs]) + if jobdate and jobdate != 'nodate': + jobdate = '%s-%s-%s' % (jobdate[0:4], jobdate[4:6], jobdate[6:8]) + #jobdetails = genJobDetails(tool=toolb, domainlink="[https://archive.fart.website/archivebot/viewer/domain/%s %s]" % (domain, domain), joburl="[https://archive.fart.website/archivebot/viewer/job/%s %s]" % (jobidtruncated, jobidtruncated), jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem) + jobdetails = genJobDetails(tool=toolb, domainlink=domain, joburl=jobidtruncated, jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem) + totaljobsize += jobsize + details.append(jobdetails) + return details, totaljobsize + +def getArchiveDetailsChromebot(url='', singleurl=False): + global ChromebotCache + details = [] + totaljobsize = 0 + if not ChromebotCache: #empty dict + ChromebotCache = loadChromebotCache() + saveChromebotCache() + #{"id": "bajop-tomur-fagok-huzol", "user": "eientei95", "date": "2019-05-21T11:51:09.286515", "warcsize": 2775866, "url": "https://twitter.com/...", "urlcount": 1} + tool = '[[Chromebot]]' + for date, jobs in ChromebotCache.items(): + for job in jobs: + if job['url'] == url or ('urlseed' in job and job['urlseed'] == url): + domain = url.split('://')[1].split('/')[0] + jobid = job['id'].split('-')[-1] # last chunk seems unique + jobdate = '-' + if 'date' in job: + jobdate = job['date'].split('T')[0] + elif 'queued' in job: + jobdate = job['queued'].split('T')[0] + jobsize = '-' + if 'warcsize' in job: + jobsize = int(job['warcsize']) + jobobjects = '1 urls' + if 'urlcount' in job: + jobobjects = "%s urls" % (int(job['urlcount'])) + itemname = job['item'] + jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects) + totaljobsize += jobsize + details.append(jobdetails) + return details, totaljobsize + +def getArchiveDetailsNarabot(url='', singleurl=False): + global NarabotCache + details = [] + totaljobsize = 0 + if not NarabotCache: #empty dict + NarabotCache = loadNarabotCache() + saveNarabotCache() + tool = '[[Narabot]]' + for itemname, props in NarabotCache.items(): + if props['originalurl'].strip('/').startswith(url.strip('/')): + domain = props['originalurl'].split('://')[1].split('/')[0] + urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname) + rawfiles = getURL(url=urlfiles, cache=True) + jobid = 'job' + jobdate = itemname.split('_-_')[1].split('_')[0] + jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)]) + jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 repo") + totaljobsize += jobsize + details.append(jobdetails) + return details, totaljobsize + +def getArchiveDetailsWikiteam(url='', singleurl=False): + global WikiteamCache + details = [] + totaljobsize = 0 + if not WikiteamCache: #empty dict + WikiteamCache = loadWikiteamCache() + saveWikiteamCache() + tool = '[[WikiTeam]]' + for itemname, props in WikiteamCache.items(): + itemname_ = re.sub(r'(?im)^wiki-', '', itemname) + if props['originalurl'].strip('/').startswith(url.strip('/')): + #if item files follows wikidump/history filename style, we use every file in item like a different job + #otherwise we count just 1 job and sum all file sizes + domain = props['originalurl'].split('://')[1].split('/')[0] + urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname) + rawfiles = getURL(url=urlfiles, cache=True) + isstandard = re.search(r'(?im)' % (itemname_), rawfiles) and True or False + if isstandard: + for xfile in rawfiles.split(''): + jobid = 'job' + jobdate = 'date' + jobsize = re.findall(r'(?im)(\d+)', xfile) and int(re.findall(r'(?im)(\d+)', xfile)[0]) or 0 + m = re.findall(r'(?im)' % (itemname_), xfile) + if m: + m = m[0] + jobid = m[3] + jobdate = '%s-%s-%s' % (m[2][0:4], m[2][4:6], m[2][6:8]) + jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 dump") + totaljobsize += jobsize + details.append(jobdetails) + else: + jobid = 'other' + jobdate = re.findall(r'(?im)(\d+)', rawfiles) and int(re.findall(r'(?im)(\d+)', rawfiles)[0]) or 'date' + if type(jobdate) is int: + jobdate = datetime.datetime.utcfromtimestamp(jobdate).strftime('%Y-%m-%d') + jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)]) + jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize) + totaljobsize += jobsize + details.append(jobdetails) + return details, totaljobsize + +def getArchiveDetailsYoutubearchive(url='', singleurl=False): + global YoutubearchiveCache + details = [] + totaljobsize = 0 + if not YoutubearchiveCache: #empty dict + YoutubearchiveCache = loadYoutubearchiveCache() + saveYoutubearchiveCache() + tool = '[[YouTube|ytarchive]]' + if re.search(r'https://www\.youtube\.com/(channel|user)/[^/]+', url): + domain = url.split('://')[1].split('/')[0] + channelid = url.split('/')[4].split('/')[0] + urlytarchive = 'https://ya.borg.xyz/logs/dl/?C=M;O=D' + rawytarchive = getURL(url=urlytarchive, cache=True) + channels = re.findall(r'(?im)', rawytarchive) + if channelid in channels: + urlytarchive2 = 'https://ya.borg.xyz/logs/dl/%s/?C=M;O=D' % (channelid) + rawytarchive2 = getURL(url=urlytarchive2, cache=True) + logs = re.findall(r'(?im)', rawytarchive2) + if logs: + logfilename = logs[0] + urlytarchive3 = 'https://ya.borg.xyz/logs/dl/%s/%s' % (channelid, logfilename) + rawytarchive3 = getURL(url=urlytarchive3, cache=True) + if re.search(r'Finished downloading playlist', rawytarchive3): + jobid = '-' + jobdate = logfilename.split('T')[0] + jobsize = '-' + jobobjects = '-' + if re.search(r'(?im)Downloading video (\d+) of \1$', rawytarchive3): + numvideos = int(re.findall(r'(?im)Downloading video (\d+) of \1$', rawytarchive3)[0]) + numerrors = re.findall(r'ERROR: ', rawytarchive3) and len(re.findall(r'ERROR: ', rawytarchive3)[0]) or 0 + jobobjects = "%s videos" % (numvideos-numerrors) + jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="%s" % (jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects) + if type(jobsize) is int: + totaljobsize += jobsize + details.append(jobdetails) + return details, totaljobsize + +def getArchiveDetailsCore(url='', singleurl=False): + detailsArchivebot, totaljobsizeArchivebot = getArchiveDetailsArchivebot(url=url, singleurl=singleurl) + """ + deprecated archives + + detailsChromebot, totaljobsizeChromebot = getArchiveDetailsChromebot(url=url, singleurl=singleurl) + detailsNarabot, totaljobsizeNarabot = getArchiveDetailsNarabot(url=url, singleurl=singleurl) + detailsWikiteam, totaljobsizeWikiteam = getArchiveDetailsWikiteam(url=url, singleurl=singleurl) + detailsYoutubearchive, totaljobsizeYoutubearchive = getArchiveDetailsYoutubearchive(url=url, singleurl=singleurl) + """ + + """ + details = detailsArchivebot + detailsChromebot + detailsNarabot + detailsWikiteam + detailsYoutubearchive + totaljobsize = totaljobsizeArchivebot + totaljobsizeChromebot + totaljobsizeNarabot + totaljobsizeWikiteam + totaljobsizeYoutubearchive + """ + + details = detailsArchivebot + totaljobsize = totaljobsizeArchivebot + + details.sort() + detailsplain = '\n|-\n'.join(details) + return detailsplain, totaljobsize + +def getArchiveDetails(url=''): + if url and '://' in url: + if '://archive.org/' in url or \ + '://www.webcitation.org/' in url: + return False, '', 0 + + domain = url.split('://')[1].split('/')[0] + if len(url.split(domain)[1]) > 1: #url is domain.ext/more + details, totaljobsize = getArchiveDetailsCore(url=url, singleurl=True) + return details and True or False, details, totaljobsize + + #url is domain.ext + details, totaljobsize = getArchiveDetailsCore(url=url, singleurl=False) + return details and True or False, details, totaljobsize + + return False, '', 0 diff --git a/wikibot-manual/urlteam-torrents.py b/wikibot-manual/urlteam-torrents.py new file mode 100644 index 0000000..b6b205d --- /dev/null +++ b/wikibot-manual/urlteam-torrents.py @@ -0,0 +1,20 @@ +import pywikibot +import requests + +def main(): + site = pywikibot.Site('en', 'ArchiveTeam') + page = pywikibot.Page(site, 'URLTeam/torrents') + + IAResponse = requests.get('https://archive.org/services/search/v1/scrape?q=subject:terroroftinytown&count=10000') + IAItems = IAResponse.json()['items'] + output = [f"https://archive.org/download/{IAItem['identifier']}/{IAItem['identifier']}_archive.torrent" for IAItem in IAItems] + outputStr = '
\n' + '\n'.join(output) + '\n
' + + # Update if necessary + if page.text != outputStr: + site.login() # Only log in when necessary + page.text = outputStr + page.save("Updated torrent list") + + +main() diff --git a/wikibot/archivebot.py b/wikibot/archivebot.py deleted file mode 100644 index 7246a8d..0000000 --- a/wikibot/archivebot.py +++ /dev/null @@ -1,264 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# https://github.com/emijrp/internet-archive/raw/master/archivebot.py - -# Copyright (C) 2018-2019 Archive Team -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import collections -import datetime -import json -import re -import sys -import time -import urllib.parse -import urllib.request -import pywikibot -import pywikibot.pagegenerators as pagegenerators - -from archiveteamfun import * - -Entry = collections.namedtuple('Entry', ('sorturl', 'url', 'label', 'note', 'line')) -truncationpattern = re.compile(r'^[^:/]+://(www\.)?') - -def parselistline(line): - label = None - note = None - if '|' in line: - url, rest = line.split('|', 1) - args = map(str.strip, rest.split('|')) - for position, arg in enumerate(args): - if '=' in arg: - key, value = map(str.strip, arg.split('=', 1)) - if key == 'label': - label = value - continue - elif key == 'note': - note = value - continue - # If it's neither, just treat it like it didn't have any '=' to begin with... - if position == 0: - label = arg - elif position == 1: - note = arg - # Everything else is ignored - else: - url = line - url = url.strip() - if '://' in url and not '/' in url.split('://')[1]: - url = url + '/' - line = url + (' | label = ' + label if label else '') + (' | note = ' + note if note else '') - sorturl = truncationpattern.sub('', url).lower() - for domain in ('transfer.sh', 'transfer.kiska.pw', 'transfer.notkiska.pw', 'ix.io'): - if domain == 'ix.io' and '+' not in sorturl: - # Only apply this stripping to the undocumented trick URLs of format ix.io/code+/filename - continue - if sorturl.startswith(domain) and sum(x == '/' for x in sorturl) == 2: - # For file hosting URLs that contain exactly two slashes, strip the first path component = the random file ID to sort by the filename instead. - sorturl = domain + sorturl[sorturl.index('/', len(domain) + 1):] - return Entry(sorturl = sorturl, url = url, label = label, note = note, line = line) - -def curateurls(wlist=''): - # Returns a dict of sectionname => list of URLs entries - # sectionname is None for URLs outside of a section (i.e. on a page without section or before the first section). - # A "URL entry" in the list is an Entry object (namedtuple); the label is None if it isn't present. - - lines = [] - currentsectionname = None - currentsectionentries = [] - sectionentries = {} - - def endsection(): - nonlocal currentsectionentries, lines, sectionentries, currentsectionname - currentsectionentries = list(set(currentsectionentries)) # Deduplicate - currentsectionentries.sort(key = lambda x: (x.sorturl, x.label if x.label is not None else '', x.url, x.note if x.note is not None else '', x.line)) - lines.extend(x.line for x in currentsectionentries) - sectionentries[currentsectionname] = currentsectionentries - currentsectionentries = [] - - for line in wlist.text.strip().splitlines(): - if line.strip().startswith('='): - # New section, sort and append previous section - endsection() - currentsectionname = line.strip().strip('=').strip() - if currentsectionname in sectionentries: - print('Warning: duplicate section name {!r} on page {}'.format(currentsectionname, wlist.title())) - if lines: - lines.append('') - lines.append(line.strip()) - elif line.strip(): - currentsectionentries.append(parselistline(line)) - endsection() - - lines = '\n'.join(lines) - if wlist.text != lines: - wlist.text = lines - wlist.save("BOT - Sorting list") - - return sectionentries - -def main(): - atsite = pywikibot.Site('en', 'ArchiveTeam') - cat = pywikibot.Category(atsite, "Category:ArchiveBot") - gen = pagegenerators.CategorizedPageGenerator(cat, start="!") - pre = pagegenerators.PreloadingGenerator(gen) - listlenlimit = 1000 - for page in pre: - wtitle = page.title() - wtext = page.text - - if len(sys.argv)>1 and not sys.argv[1] in wtitle: - continue - - if not wtitle.startswith('ArchiveBot/'): - continue - wlist = pywikibot.Page(atsite, '%s/list' % (wtitle)) - if not wlist.exists(): - print("Page %s/list doesnt exist" % (wtitle)) - continue - sectionentries = curateurls(wlist=wlist) - - print('\n===', wtitle, '===') - if (not '' in wtext and not '' in wtext: - print("No tag. Skiping...") - continue - if len(wlist.text.splitlines()) > listlenlimit: - continue - - newtext = [] - totaljobsize = 0 - totalsaved = 0 - totalnotsaved = 0 - - # Find blocks of page text that end with a bot tag - blocks = wtext.split('') - - # The last block must be tag-free, so only iterate over the previous ones - for block in blocks[:-1]: - # Find beginning of bot tag - pos = block.find('') - if pos == -1: - pos = block.find('') - continue - - if block[pos:].startswith(''): - # Sectionless tag, use section None - section = None - openingtag = '' - elif block[pos:].startswith('', pos) - if openend == -1: - print("Block's opening tag does not have an end, skipping...") - newtext.append(block) - newtext.append('') - continue - section = block[pos + 9:openend].strip() # 9 = len('') - continue - - if section not in sectionentries: - print('Block references section {!r} which does not exist, skipping...'.format(section)) - newtext.append(block) - newtext.append('') - continue - - # Add prefixed text (if any) - newtext.append(block[:pos]) - - # Add opening tag (as it was before) - newtext.append(openingtag) - - # Generate table - c = 1 - rowsplain = "" - sectionjobsize = 0 - sectionhasnotes = any(entry.note is not None for entry in sectionentries[section]) - for entry in sectionentries[section]: - viewerplain = '' - viewerdetailsplain = '' - viewer = [getArchiveDetails(url=entry.url)] - if viewer[0][0]: - viewerplain = "{{saved}}" - viewerdetailsplain = viewer[0][1] - sectionjobsize += viewer[0][2] - else: - viewerplain = "{{notsaved}}" - viewerdetailsplain = '' - rowspan = len(re.findall(r'\|-', viewerdetailsplain))+1 - rowspanplain = 'rowspan=%d | ' % (rowspan) if rowspan>1 else '' - if entry.label: - urllabel = '{{URLAB|1=%s|2=%s}}' % (entry.url, entry.label) - else: - urllabel = '{{URLAB|1=%s}}' % (entry.url) - if sectionhasnotes: - notescolumn = '%s%s || ' % (rowspanplain, entry.note if entry.note is not None else '') - else: - notescolumn = '' - rowsplain += "\n|-\n| %s%s || %s%s%s\n%s " % (rowspanplain, urllabel, notescolumn, rowspanplain, viewerplain, viewerdetailsplain if viewerdetailsplain else '| || || || || || ') - c += 1 - - totaljobsize += sectionjobsize - sectionsaved = rowsplain.count('{{saved}}') - totalsaved += sectionsaved - sectionnotsaved = rowsplain.count('{{notsaved}}') - totalnotsaved += sectionnotsaved - notesheader = 'rowspan=2 | Notes !! ' if sectionhasnotes else '' - output = """ -* '''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s) - -Do not edit this table, it is automatically updated by bot. There is a [[{{FULLPAGENAME}}/list|raw list]] of URLs that you can edit. - -{| class="wikitable sortable plainlinks" -! rowspan=2 | Website !! %srowspan=2 | Status !! colspan=6 | Archive details -|- -! Tool !! Domain !! Job !! Date !! Size !! Objects %s -|} -""" % (sectionsaved, sectionnotsaved, convertsize(b=sectionjobsize), notesheader, rowsplain) - newtext.append(output) - - newtext.append('') - - # Add the last, tag-free block - newtext.append(blocks[-1]) - - newtext = ''.join(newtext) - - # Replace total statistics if necessary - if '' in newtext: - newtext = re.sub(r'.*?', "'''Statistics''': {{saved}} (%s)){{·}} {{notsaved}} (%s){{·}} Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b = totaljobsize)), newtext) - - if wtext != newtext: - pywikibot.showDiff(wtext, newtext) - page.text = newtext - try: - page.save("BOT - Updating page: {{saved}} (%s), {{notsaved}} (%s), Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b=totaljobsize))) - except: - print("Error while saving...") - else: - print("No changes needed in", page.title()) - - cleanArchiveBotCache() - -if __name__ == '__main__': - main() diff --git a/wikibot/archiveteamfun.py b/wikibot/archiveteamfun.py deleted file mode 100644 index 2d5ff54..0000000 --- a/wikibot/archiveteamfun.py +++ /dev/null @@ -1,547 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py - -# Copyright (C) 2018-2019 Archive Team -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import datetime -import gzip -import json -import os -import pickle -import random -import re -import sys -import _thread -import time -import unicodedata -import urllib -import urllib.request -import urllib.parse - -ArchivebotCache = {} -ChromebotCache = {} -NarabotCache = {} -WikiteamCache = {} -YoutubearchiveCache = {} - -# Ideas: -# Mirrortube (no channel info in metadata :() -# Videobot -# -# Fix: -# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?) -# -# Error no json: -"""Retry in 20 seconds... -Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye -Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json -""" - -def convertsize(b=0): #bytes - if type(b) is int: - if b < 1024: #<1KiB - return '0 KiB' - elif b < 1024*1024: #<1MiB - return '%d KiB' % (b/(1024)) - elif b < 1024*1024*1024: #<1GiB - return '%d MiB' % (b/(1024*1024)) - elif b < 1024*1024*1024*1024: #<1TiB - return '%.1f GiB' % (b/(1024.0*1024*1024)) - elif b < 1024*1024*1024*1024*1024: #<1PiB - return '%.1f TiB' % (b/(1024.0*1024*1024*1024)) - elif b < 1024*1024*1024*1024*1024*1024: #<1EiB - return '%.1f PiB' % (b/(1024.0*1024*1024*1024*1024)) - else: - return b - -def loadArchivebotCache(): - c = {} - if os.path.exists('archivebot.cache'): - with open('archivebot.cache', 'rb') as f: - c = pickle.load(f) - return c.copy() - -def removeFromArchivebotCache(url='', save=True): - global ArchivebotCache - if url and url in ArchivebotCache: - del ArchivebotCache[url] - if save: - saveArchivebotCache() - -def saveArchivebotCache(): - global ArchivebotCache - with open('archivebot.cache', 'wb') as f: - pickle.dump(ArchivebotCache, f) - -def cleanArchiveBotCache(): - global ArchivebotCache - ArchivebotCache2 = ArchivebotCache.copy() - - for url, raw in ArchivebotCache2.items(): - #remove from cache urls without results - #we need to check for results in the next run - if url.startswith("https://archive.fart.website/archivebot/viewer/?q="): - if re.search(r'(?im)No search results.', raw): - removeFromArchivebotCache(url=url, save=False) - - #remove from cache domains with many jobs (FB, TW, etc) - #these result pages change frequently - if url.startswith("https://archive.fart.website/archivebot/viewer/domain/"): - domain = url.split("https://archive.fart.website/archivebot/viewer/domain/")[1] - jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw) - if len(jobs) >= 10: - removeFromArchivebotCache(url=url, save=False) - - #remove from cache jobs with problems or in progress - #we need to check wether problems were solved in the next run - if url.startswith("https://archive.fart.website/archivebot/viewer/job/"): - job = url.split("https://archive.fart.website/archivebot/viewer/job/")[1] - jsonfileurls = re.findall(r'(?im)
]+\.json)">', raw) - if not jsonfileurls and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress - removeFromArchivebotCache(url=url, save=False) - warcs = re.findall(r"(?im)>\s*[^<>\"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*\s*\s*(\d+)" % (job), raw) - if not warcs and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress - removeFromArchivebotCache(url=url, save=False) - - if 'borg.xyz/logs/' in url and not '.log' in url: - removeFromArchivebotCache(url=url, save=False) - - saveArchivebotCache() - -def loadChromebotCache(): - c = {} - if os.path.exists('chromebot.cache'): - with open('chromebot.cache', 'rb') as f: - c = pickle.load(f) - firstcached = datetime.datetime(2019, 5, 7) - today = datetime.datetime.today() - iaquery = 'https://archive.org/advancedsearch.php?q=chromebot&fl[]=identifier&sort[]=publicdate+desc&sort[]=&sort[]=&rows=5000000&page=1&output=json' - raw = getURL(url=iaquery, cache=False) - json1 = json.loads(raw) - for item in json1["response"]["docs"]: - itemname = item['identifier'] - if not re.search(r'chromebot-\d\d\d\d-\d\d-\d\d-', itemname): - continue - itemdate = itemname.split('chromebot-')[1][:10] - itemdate = datetime.datetime(int(itemdate.split('-')[0]), int(itemdate.split('-')[1]), int(itemdate.split('-')[2])) - if itemdate >= firstcached and itemdate <= today: - if not itemdate.isoformat() in c: - c[itemdate.isoformat()] = [] - urlitem = 'https://archive.org/download/%s' % (itemname) - raw2 = getURL(url=urlitem, cache=False) - print('Loading .json for', item, itemdate) - urljson = '' - if '"jobs.json"' in raw2: - urljson = 'https://archive.org/download/%s/jobs.json' % (itemname) - elif '"jobs.json.gz"' in raw2: - urljson = 'https://archive.org/download/%s/jobs.json.gz' % (itemname) - if urljson: - raw3 = getURL(url=urljson, cache=False) - for line in raw3.splitlines(): - if line.startswith('{"id":'): - json2 = json.loads(line) - json2['item'] = itemname - c[itemdate.isoformat()].append(json2) - #print(c[itemdate.isoformat()][-1]['id']) - return c.copy() - -def saveChromebotCache(): - global ChromebotCache - with open('chromebot.cache', 'wb') as f: - pickle.dump(ChromebotCache, f) - -def loadNarabotCache(): - c = {} - if os.path.exists('narabot.cache'): - with open('narabot.cache', 'rb') as f: - c = pickle.load(f) - iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Agithub_narabot_mirror&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json' - raw = getURL(url=iaquery, cache=False) - json1 = json.loads(raw) - for item in json1["response"]["docs"]: - if not 'originalurl' in item: - continue - itemname = item['identifier'] - originalurl = item['originalurl'] - if type(originalurl) is list: - originalurl = originalurl[0] - c[itemname] = { 'originalurl': originalurl } - return c.copy() - -def saveNarabotCache(): - global NarabotCache - with open('narabot.cache', 'wb') as f: - pickle.dump(NarabotCache, f) - -def loadWikiteamCache(): - c = {} - if os.path.exists('wikiteam.cache'): - with open('wikiteam.cache', 'rb') as f: - c = pickle.load(f) - iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Awikiteam&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json' - raw = getURL(url=iaquery, cache=False) - json1 = json.loads(raw) - for item in json1["response"]["docs"]: - if not 'originalurl' in item: - continue - itemname = item['identifier'] - originalurl = item['originalurl'] - if type(originalurl) is list: - originalurl = originalurl[0] - #if not itemname.startswith('wiki-'): - # continue - c[itemname] = { 'originalurl': originalurl } - return c.copy() - -def saveWikiteamCache(): - global WikiteamCache - with open('wikiteam.cache', 'wb') as f: - pickle.dump(WikiteamCache, f) - -def loadYoutubearchiveCache(): - pass - -def saveYoutubearchiveCache(): - pass - -def getURL(url='', cache=False, retry=True): - global ArchivebotCache - - if '8grwh' in url: #deleted jobs/jsons - return '' - - if cache: #do not download if it is cached - if not ArchivebotCache: #empty dict - ArchivebotCache = loadArchivebotCache() - if url: - if url in ArchivebotCache: - #print("Using cached page for %s" % (url)) - return ArchivebotCache[url] - raw = '' - headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0' } - request = urllib.request.Request(url, headers=headers) - try: - print("Retrieving: %s" % (url)) - response = urllib.request.urlopen(request) - if url.endswith('.gz'): - gzipFile = gzip.GzipFile(fileobj=response) - raw = gzipFile.read().strip().decode('utf-8') - else: - raw = response.read().strip().decode('utf-8') - if cache: #refresh cache - ArchivebotCache[url] = raw - if not random.randint(0, 100): - saveArchivebotCache() - except: - if url.endswith('.json'): #some .json are deleted on IA - return '' - - sleep = 10 # seconds - maxsleep = 30 - while retry and sleep <= maxsleep: - print('Error while retrieving: %s' % (url)) - print('Retry in %s seconds...' % (sleep)) - time.sleep(sleep) - try: - response = urllib.request.urlopen(request) - if url.endswith('.gz'): - gzipFile = gzip.GzipFile(fileobj=response) - raw = gzipFile.read().strip().decode('utf-8') - else: - raw = response.read().strip().decode('utf-8') - if cache: #refresh cache - ArchivebotCache[url] = raw - except: - pass - sleep = sleep * 2 - return raw - -def loadSPARQL(sparql=''): - json1 = '' - if sparql: - try: - json1 = json.loads(sparql) - return json1 - except: - print('Error downloading SPARQL? Malformatted JSON? Skiping\n') - return - else: - print('Server return empty file') - return - return - -def genJobDetails(tool='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False): - jobdetails = "" - if type(jobsize) is int: - if jobsize < 1024: - jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{red|%s}} || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, jobsize, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects) - else: - jobcolor = 'green' - if jobaborted: - jobcolor = 'orange' - if jobproblem: - jobcolor = 'purple' - jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{%s|%s}} || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, jobsize, jobcolor, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects) - else: - jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=0 | %s || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects) - return jobdetails - -def getArchiveDetailsArchivebot(url='', singleurl=False): - viewerurl = 'https://archive.fart.website/archivebot/viewer/?q=' + url - origdomain = url.split('://')[1].split('/')[0] - origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain) - rawdomains = getURL(url=viewerurl, cache=False) - domains = list(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains))) - if not domains: #no results for this url, remove cache - removeFromArchivebotCache(url=viewerurl) - details = [] - totaljobsize = 0 - jobslimit = 10 # before 10000 - tool = '[[ArchiveBot]]' - for domain in domains: - if domain != origdomain and not domain in origdomain and not origdomain2 in domain: - continue - urljobs = "https://archive.fart.website/archivebot/viewer/domain/" + domain - rawjobs = getURL(url=urljobs, cache=False) #false, we want the most recent list of jobs always - - #remove unrelated jobs, for example googlesites, facebook, etc - rawjobs2 = "" - for rawjobcandidate in rawjobs.split(""): - if url.split('://')[1].strip('/') in rawjobcandidate: - rawjobs2 += rawjobcandidate - rawjobs = rawjobs2 - - jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs) - print("jobs", jobs) - for jobid in jobs[:jobslimit]: - jobidtruncated = jobid[-5:] - urljob = "https://archive.fart.website/archivebot/viewer/job/" + jobid - print(urljob) - rawjob = getURL(url=urljob, cache=True) - jsonfileurls = re.findall(r'(?im)', rawjob) - for jsonfileurl in jsonfileurls: - print(jsonfileurl) - if singleurl: - jsonraw = getURL(url=jsonfileurl, cache=True) #cache json from internet archive - try: - jsonfileloaded = json.loads(jsonraw) - except: - continue - if not 'url' in jsonfileloaded or ('url' in jsonfileloaded and jsonfileloaded['url'].strip('/') != url.strip('/')): - continue - - jobproblem = False - warcs = re.findall(r"(?im)\" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\">[^<>\"]*?\s*\s*(\d+)" % (jobidtruncated), rawjob) - print(warcs) - if not warcs: - jobproblem = True - jobdatetimes = [] - for warc in warcs: - jobdatetimes.append("%s-%s" % (warc[1], warc[2])) - jobdatetimes = list(set(jobdatetimes)) - jobdatetimes.sort() - for jobdatetime in jobdatetimes: - if not jobdatetime in jsonfileurl: - continue - warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobidtruncated), rawjob)) - inforshallow = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobidtruncated), rawjob))) - - print(jobdatetime, warcsnometa, inforshallow) - - inforshallow = len(inforshallow) == 1 and inforshallow[0] or 'unknown' - toolb = "%s%s" % (tool, inforshallow == 'unknown' and '' or " (!%s)" % (inforshallow == 'inf' and 'a' or 'ao')) - jobaborted = False - if ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob: - jobaborted = True - jobdate = '-' in jobdatetime and jobdatetime.split('-')[0] or 'nodate' - jobsize = sum([jobdatetime == '%s-%s' % (warc[1], warc[2]) and int(warc[3]) or 0 for warc in warcs]) - if jobdate and jobdate != 'nodate': - jobdate = '%s-%s-%s' % (jobdate[0:4], jobdate[4:6], jobdate[6:8]) - #jobdetails = genJobDetails(tool=toolb, domainlink="[https://archive.fart.website/archivebot/viewer/domain/%s %s]" % (domain, domain), joburl="[https://archive.fart.website/archivebot/viewer/job/%s %s]" % (jobidtruncated, jobidtruncated), jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem) - jobdetails = genJobDetails(tool=toolb, domainlink=domain, joburl=jobidtruncated, jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem) - totaljobsize += jobsize - details.append(jobdetails) - return details, totaljobsize - -def getArchiveDetailsChromebot(url='', singleurl=False): - global ChromebotCache - details = [] - totaljobsize = 0 - if not ChromebotCache: #empty dict - ChromebotCache = loadChromebotCache() - saveChromebotCache() - #{"id": "bajop-tomur-fagok-huzol", "user": "eientei95", "date": "2019-05-21T11:51:09.286515", "warcsize": 2775866, "url": "https://twitter.com/...", "urlcount": 1} - tool = '[[Chromebot]]' - for date, jobs in ChromebotCache.items(): - for job in jobs: - if job['url'] == url or ('urlseed' in job and job['urlseed'] == url): - domain = url.split('://')[1].split('/')[0] - jobid = job['id'].split('-')[-1] # last chunk seems unique - jobdate = '-' - if 'date' in job: - jobdate = job['date'].split('T')[0] - elif 'queued' in job: - jobdate = job['queued'].split('T')[0] - jobsize = '-' - if 'warcsize' in job: - jobsize = int(job['warcsize']) - jobobjects = '1 urls' - if 'urlcount' in job: - jobobjects = "%s urls" % (int(job['urlcount'])) - itemname = job['item'] - jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects) - totaljobsize += jobsize - details.append(jobdetails) - return details, totaljobsize - -def getArchiveDetailsNarabot(url='', singleurl=False): - global NarabotCache - details = [] - totaljobsize = 0 - if not NarabotCache: #empty dict - NarabotCache = loadNarabotCache() - saveNarabotCache() - tool = '[[Narabot]]' - for itemname, props in NarabotCache.items(): - if props['originalurl'].strip('/').startswith(url.strip('/')): - domain = props['originalurl'].split('://')[1].split('/')[0] - urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname) - rawfiles = getURL(url=urlfiles, cache=True) - jobid = 'job' - jobdate = itemname.split('_-_')[1].split('_')[0] - jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)]) - jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 repo") - totaljobsize += jobsize - details.append(jobdetails) - return details, totaljobsize - -def getArchiveDetailsWikiteam(url='', singleurl=False): - global WikiteamCache - details = [] - totaljobsize = 0 - if not WikiteamCache: #empty dict - WikiteamCache = loadWikiteamCache() - saveWikiteamCache() - tool = '[[WikiTeam]]' - for itemname, props in WikiteamCache.items(): - itemname_ = re.sub(r'(?im)^wiki-', '', itemname) - if props['originalurl'].strip('/').startswith(url.strip('/')): - #if item files follows wikidump/history filename style, we use every file in item like a different job - #otherwise we count just 1 job and sum all file sizes - domain = props['originalurl'].split('://')[1].split('/')[0] - urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname) - rawfiles = getURL(url=urlfiles, cache=True) - isstandard = re.search(r'(?im)' % (itemname_), rawfiles) and True or False - if isstandard: - for xfile in rawfiles.split(''): - jobid = 'job' - jobdate = 'date' - jobsize = re.findall(r'(?im)(\d+)', xfile) and int(re.findall(r'(?im)(\d+)', xfile)[0]) or 0 - m = re.findall(r'(?im)' % (itemname_), xfile) - if m: - m = m[0] - jobid = m[3] - jobdate = '%s-%s-%s' % (m[2][0:4], m[2][4:6], m[2][6:8]) - jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 dump") - totaljobsize += jobsize - details.append(jobdetails) - else: - jobid = 'other' - jobdate = re.findall(r'(?im)(\d+)', rawfiles) and int(re.findall(r'(?im)(\d+)', rawfiles)[0]) or 'date' - if type(jobdate) is int: - jobdate = datetime.datetime.utcfromtimestamp(jobdate).strftime('%Y-%m-%d') - jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)]) - jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize) - totaljobsize += jobsize - details.append(jobdetails) - return details, totaljobsize - -def getArchiveDetailsYoutubearchive(url='', singleurl=False): - global YoutubearchiveCache - details = [] - totaljobsize = 0 - if not YoutubearchiveCache: #empty dict - YoutubearchiveCache = loadYoutubearchiveCache() - saveYoutubearchiveCache() - tool = '[[YouTube|ytarchive]]' - if re.search(r'https://www\.youtube\.com/(channel|user)/[^/]+', url): - domain = url.split('://')[1].split('/')[0] - channelid = url.split('/')[4].split('/')[0] - urlytarchive = 'https://ya.borg.xyz/logs/dl/?C=M;O=D' - rawytarchive = getURL(url=urlytarchive, cache=True) - channels = re.findall(r'(?im)', rawytarchive) - if channelid in channels: - urlytarchive2 = 'https://ya.borg.xyz/logs/dl/%s/?C=M;O=D' % (channelid) - rawytarchive2 = getURL(url=urlytarchive2, cache=True) - logs = re.findall(r'(?im)', rawytarchive2) - if logs: - logfilename = logs[0] - urlytarchive3 = 'https://ya.borg.xyz/logs/dl/%s/%s' % (channelid, logfilename) - rawytarchive3 = getURL(url=urlytarchive3, cache=True) - if re.search(r'Finished downloading playlist', rawytarchive3): - jobid = '-' - jobdate = logfilename.split('T')[0] - jobsize = '-' - jobobjects = '-' - if re.search(r'(?im)Downloading video (\d+) of \1$', rawytarchive3): - numvideos = int(re.findall(r'(?im)Downloading video (\d+) of \1$', rawytarchive3)[0]) - numerrors = re.findall(r'ERROR: ', rawytarchive3) and len(re.findall(r'ERROR: ', rawytarchive3)[0]) or 0 - jobobjects = "%s videos" % (numvideos-numerrors) - jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="%s" % (jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects) - if type(jobsize) is int: - totaljobsize += jobsize - details.append(jobdetails) - return details, totaljobsize - -def getArchiveDetailsCore(url='', singleurl=False): - detailsArchivebot, totaljobsizeArchivebot = getArchiveDetailsArchivebot(url=url, singleurl=singleurl) - """ - deprecated archives - - detailsChromebot, totaljobsizeChromebot = getArchiveDetailsChromebot(url=url, singleurl=singleurl) - detailsNarabot, totaljobsizeNarabot = getArchiveDetailsNarabot(url=url, singleurl=singleurl) - detailsWikiteam, totaljobsizeWikiteam = getArchiveDetailsWikiteam(url=url, singleurl=singleurl) - detailsYoutubearchive, totaljobsizeYoutubearchive = getArchiveDetailsYoutubearchive(url=url, singleurl=singleurl) - """ - - """ - details = detailsArchivebot + detailsChromebot + detailsNarabot + detailsWikiteam + detailsYoutubearchive - totaljobsize = totaljobsizeArchivebot + totaljobsizeChromebot + totaljobsizeNarabot + totaljobsizeWikiteam + totaljobsizeYoutubearchive - """ - - details = detailsArchivebot - totaljobsize = totaljobsizeArchivebot - - details.sort() - detailsplain = '\n|-\n'.join(details) - return detailsplain, totaljobsize - -def getArchiveDetails(url=''): - if url and '://' in url: - if '://archive.org/' in url or \ - '://www.webcitation.org/' in url: - return False, '', 0 - - domain = url.split('://')[1].split('/')[0] - if len(url.split(domain)[1]) > 1: #url is domain.ext/more - details, totaljobsize = getArchiveDetailsCore(url=url, singleurl=True) - return details and True or False, details, totaljobsize - - #url is domain.ext - details, totaljobsize = getArchiveDetailsCore(url=url, singleurl=False) - return details and True or False, details, totaljobsize - - return False, '', 0 diff --git a/wikibot/urlteam-torrents.py b/wikibot/urlteam-torrents.py deleted file mode 100644 index b6b205d..0000000 --- a/wikibot/urlteam-torrents.py +++ /dev/null @@ -1,20 +0,0 @@ -import pywikibot -import requests - -def main(): - site = pywikibot.Site('en', 'ArchiveTeam') - page = pywikibot.Page(site, 'URLTeam/torrents') - - IAResponse = requests.get('https://archive.org/services/search/v1/scrape?q=subject:terroroftinytown&count=10000') - IAItems = IAResponse.json()['items'] - output = [f"https://archive.org/download/{IAItem['identifier']}/{IAItem['identifier']}_archive.torrent" for IAItem in IAItems] - outputStr = '
\n' + '\n'.join(output) + '\n
' - - # Update if necessary - if page.text != outputStr: - site.login() # Only log in when necessary - page.text = outputStr - page.save("Updated torrent list") - - -main() -- cgit v1.3.1-10-gc9f91