From 34df4cd5eb5d945f456a0f4a3584438a5a850976 Mon Sep 17 00:00:00 2001
From: klea
Date: Sun, 17 May 2026 13:28:12 +0000
Subject: wikibot: Move some scripts to wikibot-manual
---
wikibot-manual/archivebot.py | 264 ++++++++++++++++++
wikibot-manual/archiveteamfun.py | 547 +++++++++++++++++++++++++++++++++++++
wikibot-manual/urlteam-torrents.py | 20 ++
wikibot/archivebot.py | 264 ------------------
wikibot/archiveteamfun.py | 547 -------------------------------------
wikibot/urlteam-torrents.py | 20 --
6 files changed, 831 insertions(+), 831 deletions(-)
create mode 100644 wikibot-manual/archivebot.py
create mode 100644 wikibot-manual/archiveteamfun.py
create mode 100644 wikibot-manual/urlteam-torrents.py
delete mode 100644 wikibot/archivebot.py
delete mode 100644 wikibot/archiveteamfun.py
delete mode 100644 wikibot/urlteam-torrents.py
diff --git a/wikibot-manual/archivebot.py b/wikibot-manual/archivebot.py
new file mode 100644
index 0000000..7246a8d
--- /dev/null
+++ b/wikibot-manual/archivebot.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# https://github.com/emijrp/internet-archive/raw/master/archivebot.py
+
+# Copyright (C) 2018-2019 Archive Team
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import collections
+import datetime
+import json
+import re
+import sys
+import time
+import urllib.parse
+import urllib.request
+import pywikibot
+import pywikibot.pagegenerators as pagegenerators
+
+from archiveteamfun import *
+
+Entry = collections.namedtuple('Entry', ('sorturl', 'url', 'label', 'note', 'line'))
+truncationpattern = re.compile(r'^[^:/]+://(www\.)?')
+
+def parselistline(line):
+    """Parse one line of a raw URL list into an Entry.
+
+    A line is a URL optionally followed by '|'-separated arguments. An argument
+    of the form 'label = ...' or 'note = ...' sets that field; any other
+    argument is positional (first argument: label, second: note) and further
+    positional arguments are ignored.
+
+    The returned Entry carries the normalised URL (a bare 'scheme://host' gets a
+    trailing slash), the label and note (None when absent), the canonical text
+    of the line, and a lower-cased sort key without scheme and leading 'www.'.
+    """
+    url, separator, rest = line.partition('|')
+    label = note = None
+    if separator:
+        for index, rawarg in enumerate(rest.split('|')):
+            arg = rawarg.strip()
+            key, equals, value = arg.partition('=')
+            key = key.strip()
+            if equals and key == 'label':
+                label = value.strip()
+            elif equals and key == 'note':
+                note = value.strip()
+            # Anything else is handled as if it had no '=' at all
+            elif index == 0:
+                label = arg
+            elif index == 1:
+                note = arg
+
+    url = url.strip()
+    if '://' in url and '/' not in url.split('://')[1]:
+        url += '/'
+
+    pieces = [url]
+    if label:
+        pieces.append(' | label = ' + label)
+    if note:
+        pieces.append(' | note = ' + note)
+    line = ''.join(pieces)
+
+    sorturl = truncationpattern.sub('', url).lower()
+    for domain in ('transfer.sh', 'transfer.kiska.pw', 'transfer.notkiska.pw', 'ix.io'):
+        # On ix.io only the undocumented ix.io/code+/filename form is rewritten
+        if domain == 'ix.io' and '+' not in sorturl:
+            continue
+        if sorturl.startswith(domain) and sorturl.count('/') == 2:
+            # File hosting URL with exactly two slashes: drop the random file ID so the entry sorts by filename
+            sorturl = domain + sorturl[sorturl.index('/', len(domain) + 1):]
+    return Entry(sorturl = sorturl, url = url, label = label, note = note, line = line)
+
+def curateurls(wlist=''):
+    """Normalise the URL list page `wlist` and return its entries by section.
+
+    Returns a dict mapping section name to a sorted, de-duplicated list of
+    Entry objects. The key is None for entries that precede the first section
+    heading. When the normalised text differs from the page text, the page is
+    saved with the normalised text.
+    """
+    outputlines = []
+    sectionentries = {}
+    sectionname = None
+    pending = []
+
+    def flush(name, entries):
+        # Deduplicate and sort one section, then record it in both outputs
+        ordered = sorted(set(entries), key = lambda e: (e.sorturl, e.label if e.label is not None else '', e.url, e.note if e.note is not None else '', e.line))
+        outputlines.extend(e.line for e in ordered)
+        sectionentries[name] = ordered
+
+    for rawline in wlist.text.strip().splitlines():
+        stripped = rawline.strip()
+        if not stripped:
+            continue
+        if not stripped.startswith('='):
+            pending.append(parselistline(rawline))
+            continue
+        # Section heading: close the section collected so far, then start the new one
+        flush(sectionname, pending)
+        pending = []
+        sectionname = stripped.strip('=').strip()
+        if sectionname in sectionentries:
+            print('Warning: duplicate section name {!r} on page {}'.format(sectionname, wlist.title()))
+        if outputlines:
+            outputlines.append('')
+        outputlines.append(stripped)
+    flush(sectionname, pending)
+
+    newtext = '\n'.join(outputlines)
+    if wlist.text != newtext:
+        wlist.text = newtext
+        wlist.save("BOT - Sorting list")
+
+    return sectionentries
+
+def main():
+ atsite = pywikibot.Site('en', 'ArchiveTeam')
+ cat = pywikibot.Category(atsite, "Category:ArchiveBot")
+ gen = pagegenerators.CategorizedPageGenerator(cat, start="!")
+ pre = pagegenerators.PreloadingGenerator(gen)
+ listlenlimit = 1000
+ for page in pre:
+ wtitle = page.title()
+ wtext = page.text
+
+ if len(sys.argv)>1 and not sys.argv[1] in wtitle:
+ continue
+
+ if not wtitle.startswith('ArchiveBot/'):
+ continue
+ wlist = pywikibot.Page(atsite, '%s/list' % (wtitle))
+ if not wlist.exists():
+ print("Page %s/list doesnt exist" % (wtitle))
+ continue
+ sectionentries = curateurls(wlist=wlist)
+
+ print('\n===', wtitle, '===')
+ if (not '' in wtext and not '' in wtext:
+ print("No tag. Skiping...")
+ continue
+ if len(wlist.text.splitlines()) > listlenlimit:
+ continue
+
+ newtext = []
+ totaljobsize = 0
+ totalsaved = 0
+ totalnotsaved = 0
+
+ # Find blocks of page text that end with a bot tag
+ blocks = wtext.split('')
+
+ # The last block must be tag-free, so only iterate over the previous ones
+ for block in blocks[:-1]:
+ # Find beginning of bot tag
+ pos = block.find('')
+ if pos == -1:
+ pos = block.find('')
+ continue
+
+ if block[pos:].startswith(''):
+ # Sectionless tag, use section None
+ section = None
+ openingtag = ''
+ elif block[pos:].startswith('', pos)
+ if openend == -1:
+ print("Block's opening tag does not have an end, skipping...")
+ newtext.append(block)
+ newtext.append('')
+ continue
+ section = block[pos + 9:openend].strip() # 9 = len('')
+ continue
+
+ if section not in sectionentries:
+ print('Block references section {!r} which does not exist, skipping...'.format(section))
+ newtext.append(block)
+ newtext.append('')
+ continue
+
+ # Add prefixed text (if any)
+ newtext.append(block[:pos])
+
+ # Add opening tag (as it was before)
+ newtext.append(openingtag)
+
+ # Generate table
+ c = 1
+ rowsplain = ""
+ sectionjobsize = 0
+ sectionhasnotes = any(entry.note is not None for entry in sectionentries[section])
+ for entry in sectionentries[section]:
+ viewerplain = ''
+ viewerdetailsplain = ''
+ viewer = [getArchiveDetails(url=entry.url)]
+ if viewer[0][0]:
+ viewerplain = "{{saved}}"
+ viewerdetailsplain = viewer[0][1]
+ sectionjobsize += viewer[0][2]
+ else:
+ viewerplain = "{{notsaved}}"
+ viewerdetailsplain = ''
+ rowspan = len(re.findall(r'\|-', viewerdetailsplain))+1
+ rowspanplain = 'rowspan=%d | ' % (rowspan) if rowspan>1 else ''
+ if entry.label:
+ urllabel = '{{URLAB|1=%s|2=%s}}' % (entry.url, entry.label)
+ else:
+ urllabel = '{{URLAB|1=%s}}' % (entry.url)
+ if sectionhasnotes:
+ notescolumn = '%s%s || ' % (rowspanplain, entry.note if entry.note is not None else '')
+ else:
+ notescolumn = ''
+ rowsplain += "\n|-\n| %s%s || %s%s%s\n%s " % (rowspanplain, urllabel, notescolumn, rowspanplain, viewerplain, viewerdetailsplain if viewerdetailsplain else '| || || || || || ')
+ c += 1
+
+ totaljobsize += sectionjobsize
+ sectionsaved = rowsplain.count('{{saved}}')
+ totalsaved += sectionsaved
+ sectionnotsaved = rowsplain.count('{{notsaved}}')
+ totalnotsaved += sectionnotsaved
+ notesheader = 'rowspan=2 | Notes !! ' if sectionhasnotes else ''
+ output = """
+* '''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)
+
+Do not edit this table, it is automatically updated by bot. There is a [[{{FULLPAGENAME}}/list|raw list]] of URLs that you can edit.
+
+{| class="wikitable sortable plainlinks"
+! rowspan=2 | Website !! %srowspan=2 | Status !! colspan=6 | Archive details
+|-
+! Tool !! Domain !! Job !! Date !! Size !! Objects %s
+|}
+""" % (sectionsaved, sectionnotsaved, convertsize(b=sectionjobsize), notesheader, rowsplain)
+ newtext.append(output)
+
+ newtext.append('')
+
+ # Add the last, tag-free block
+ newtext.append(blocks[-1])
+
+ newtext = ''.join(newtext)
+
+ # Replace total statistics if necessary
+ if '' in newtext:
+ newtext = re.sub(r'.*?', "'''Statistics''': {{saved}} (%s)){{·}} {{notsaved}} (%s){{·}} Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b = totaljobsize)), newtext)
+
+ if wtext != newtext:
+ pywikibot.showDiff(wtext, newtext)
+ page.text = newtext
+ try:
+ page.save("BOT - Updating page: {{saved}} (%s), {{notsaved}} (%s), Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b=totaljobsize)))
+ except:
+ print("Error while saving...")
+ else:
+ print("No changes needed in", page.title())
+
+ cleanArchiveBotCache()
+
+if __name__ == '__main__':
+ main()
diff --git a/wikibot-manual/archiveteamfun.py b/wikibot-manual/archiveteamfun.py
new file mode 100644
index 0000000..2d5ff54
--- /dev/null
+++ b/wikibot-manual/archiveteamfun.py
@@ -0,0 +1,547 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py
+
+# Copyright (C) 2018-2019 Archive Team
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import datetime
+import gzip
+import json
+import os
+import pickle
+import random
+import re
+import sys
+import _thread
+import time
+import unicodedata
+import urllib
+import urllib.request
+import urllib.parse
+
+ArchivebotCache = {}
+ChromebotCache = {}
+NarabotCache = {}
+WikiteamCache = {}
+YoutubearchiveCache = {}
+
+# Ideas:
+# Mirrortube (no channel info in metadata :()
+# Videobot
+#
+# Fix:
+# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?)
+#
+# Error no json:
+"""Retry in 20 seconds...
+Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye
+Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json
+"""
+
+def convertsize(b=0): #bytes
+    """Format a byte count as a short human-readable size string.
+
+    Integers below 1 KiB give '0 KiB'; KiB and MiB values are truncated to whole
+    units; GiB and larger use one decimal place. Integers of 1 EiB or more are
+    reported in EiB instead of falling through without a result. Any value that
+    is not a plain int (for example the '-' placeholder) is returned unchanged.
+    """
+    if type(b) is not int:
+        return b
+    if b < 1024: #<1KiB
+        return '0 KiB'
+    if b < 1024 ** 2: #<1MiB
+        return '%d KiB' % (b // 1024)
+    if b < 1024 ** 3: #<1GiB
+        return '%d MiB' % (b // 1024 ** 2)
+    for exponent, unit in ((3, 'GiB'), (4, 'TiB'), (5, 'PiB')):
+        if b < 1024 ** (exponent + 1):
+            return '%.1f %s' % (b / 1024.0 ** exponent, unit)
+    return '%.1f EiB' % (b / 1024.0 ** 6)
+
+def loadArchivebotCache():
+    """Return the cache stored in 'archivebot.cache', or an empty dict if that file does not exist."""
+    if not os.path.exists('archivebot.cache'):
+        return {}
+    # NOTE: pickle.load can execute code embedded in the file; only load cache files written by this script.
+    with open('archivebot.cache', 'rb') as cachefile:
+        return pickle.load(cachefile).copy()
+
+def removeFromArchivebotCache(url='', save=True):
+ # Delete `url` from the in-memory ArchivebotCache when it is present; `save` controls whether the cache is then written to disk via saveArchivebotCache().
+ # NOTE(review): indentation is lost in this copy of the patch, so it is not visible whether the `if save:` check is nested inside the membership check -- confirm against the real file.
+ global ArchivebotCache
+ if url and url in ArchivebotCache:
+ del ArchivebotCache[url]
+ if save:
+ saveArchivebotCache()
+
+def saveArchivebotCache():
+    """Write the in-memory ArchivebotCache to 'archivebot.cache' using pickle."""
+    with open('archivebot.cache', 'wb') as cachefile:
+        cachefile.write(pickle.dumps(ArchivebotCache))
+
+def cleanArchiveBotCache():
+ # Remove cache entries that must be fetched again on the next run, then write the cache to disk.
+ # NOTE(review): several regex literals below look truncated in this copy of the patch (HTML-like text appears to have been stripped, and one pattern is split across two lines) -- confirm them against the real file.
+ global ArchivebotCache
+ # Iterate over a copy because entries are deleted from ArchivebotCache inside the loop
+ ArchivebotCache2 = ArchivebotCache.copy()
+
+ for url, raw in ArchivebotCache2.items():
+ #remove from cache urls without results
+ #we need to check for results in the next run
+ if url.startswith("https://archive.fart.website/archivebot/viewer/?q="):
+ if re.search(r'(?im)No search results.', raw):
+ removeFromArchivebotCache(url=url, save=False)
+
+ #remove from cache domains with many jobs (FB, TW, etc)
+ #these result pages change frequently
+ if url.startswith("https://archive.fart.website/archivebot/viewer/domain/"):
+ domain = url.split("https://archive.fart.website/archivebot/viewer/domain/")[1]
+ jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw)
+ if len(jobs) >= 10:
+ removeFromArchivebotCache(url=url, save=False)
+
+ #remove from cache jobs with problems or in progress
+ #we need to check whether problems were solved in the next run
+ if url.startswith("https://archive.fart.website/archivebot/viewer/job/"):
+ job = url.split("https://archive.fart.website/archivebot/viewer/job/")[1]
+ jsonfileurls = re.findall(r'(?im) ]+\.json)">', raw)
+ # A page that mentions a timestamp from the current year but has no .json link is treated as a job in progress
+ if not jsonfileurls and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress
+ removeFromArchivebotCache(url=url, save=False)
+ warcs = re.findall(r"(?im)>\s*[^<>\"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*\s*\s*
(\d+) | " % (job), raw)
+ if not warcs and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress
+ removeFromArchivebotCache(url=url, save=False)
+
+ # borg.xyz log URLs that do not contain '.log' are dropped from the cache
+ if 'borg.xyz/logs/' in url and not '.log' in url:
+ removeFromArchivebotCache(url=url, save=False)
+
+ # Every removal above passed save=False, so persist once at the end
+ saveArchivebotCache()
+
+def loadChromebotCache():
+ # Build the Chromebot job cache: start from 'chromebot.cache' if it exists, then add job records read from archive.org items named chromebot-YYYY-MM-DD-*.
+ # Returns a copy of a dict that maps an item date (ISO string) to a list of job dicts; each job dict gets an extra 'item' key holding the item identifier.
+ c = {}
+ if os.path.exists('chromebot.cache'):
+ # NOTE: pickle.load can execute code embedded in the file; only load cache files written by this script
+ with open('chromebot.cache', 'rb') as f:
+ c = pickle.load(f)
+ firstcached = datetime.datetime(2019, 5, 7)
+ today = datetime.datetime.today()
+ iaquery = 'https://archive.org/advancedsearch.php?q=chromebot&fl[]=identifier&sort[]=publicdate+desc&sort[]=&sort[]=&rows=5000000&page=1&output=json'
+ raw = getURL(url=iaquery, cache=False)
+ json1 = json.loads(raw)
+ for item in json1["response"]["docs"]:
+ itemname = item['identifier']
+ if not re.search(r'chromebot-\d\d\d\d-\d\d-\d\d-', itemname):
+ continue
+ # The ten characters after 'chromebot-' are the item date (YYYY-MM-DD)
+ itemdate = itemname.split('chromebot-')[1][:10]
+ itemdate = datetime.datetime(int(itemdate.split('-')[0]), int(itemdate.split('-')[1]), int(itemdate.split('-')[2]))
+ if itemdate >= firstcached and itemdate <= today:
+ if not itemdate.isoformat() in c:
+ c[itemdate.isoformat()] = []
+ urlitem = 'https://archive.org/download/%s' % (itemname)
+ raw2 = getURL(url=urlitem, cache=False)
+ print('Loading .json for', item, itemdate)
+ # Use jobs.json or jobs.json.gz, whichever the item's download listing mentions
+ urljson = ''
+ if '"jobs.json"' in raw2:
+ urljson = 'https://archive.org/download/%s/jobs.json' % (itemname)
+ elif '"jobs.json.gz"' in raw2:
+ urljson = 'https://archive.org/download/%s/jobs.json.gz' % (itemname)
+ if urljson:
+ raw3 = getURL(url=urljson, cache=False)
+ # Each line that starts with {"id": is parsed as one JSON job record
+ for line in raw3.splitlines():
+ if line.startswith('{"id":'):
+ json2 = json.loads(line)
+ json2['item'] = itemname
+ c[itemdate.isoformat()].append(json2)
+ #print(c[itemdate.isoformat()][-1]['id'])
+ return c.copy()
+
+def saveChromebotCache():
+    """Write the in-memory ChromebotCache to 'chromebot.cache' using pickle."""
+    with open('chromebot.cache', 'wb') as cachefile:
+        cachefile.write(pickle.dumps(ChromebotCache))
+
+def loadNarabotCache():
+    """Return the Narabot item cache, refreshed from archive.org.
+
+    Starts from 'narabot.cache' when that file exists, then adds one entry per
+    search result of the github_narabot_mirror collection that has an
+    'originalurl' field. The result maps item identifier to
+    {'originalurl': ...}; a list-valued originalurl contributes its first element.
+    """
+    cache = {}
+    if os.path.exists('narabot.cache'):
+        # NOTE: pickle.load can execute code embedded in the file; only load cache files written by this script
+        with open('narabot.cache', 'rb') as cachefile:
+            cache = pickle.load(cachefile)
+    iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Agithub_narabot_mirror&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json'
+    results = json.loads(getURL(url=iaquery, cache=False))
+    for doc in results["response"]["docs"]:
+        if 'originalurl' not in doc:
+            continue
+        itemname = doc['identifier']
+        originalurl = doc['originalurl']
+        if type(originalurl) is list:
+            originalurl = originalurl[0]
+        cache[itemname] = { 'originalurl': originalurl }
+    return cache.copy()
+
+def saveNarabotCache():
+    """Write the in-memory NarabotCache to 'narabot.cache' using pickle."""
+    with open('narabot.cache', 'wb') as cachefile:
+        cachefile.write(pickle.dumps(NarabotCache))
+
+def loadWikiteamCache():
+    """Return the WikiTeam item cache, refreshed from archive.org.
+
+    Starts from 'wikiteam.cache' when that file exists, then adds one entry per
+    search result of the wikiteam collection that has an 'originalurl' field.
+    The result maps item identifier to {'originalurl': ...}; a list-valued
+    originalurl contributes its first element.
+    """
+    cache = {}
+    if os.path.exists('wikiteam.cache'):
+        # NOTE: pickle.load can execute code embedded in the file; only load cache files written by this script
+        with open('wikiteam.cache', 'rb') as cachefile:
+            cache = pickle.load(cachefile)
+    iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Awikiteam&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json'
+    results = json.loads(getURL(url=iaquery, cache=False))
+    for doc in results["response"]["docs"]:
+        if 'originalurl' not in doc:
+            continue
+        itemname = doc['identifier']
+        originalurl = doc['originalurl']
+        if type(originalurl) is list:
+            originalurl = originalurl[0]
+        # Items are kept regardless of whether their identifier starts with 'wiki-'
+        cache[itemname] = { 'originalurl': originalurl }
+    return cache.copy()
+
+def saveWikiteamCache():
+    """Write the in-memory WikiteamCache to 'wikiteam.cache' using pickle."""
+    with open('wikiteam.cache', 'wb') as cachefile:
+        cachefile.write(pickle.dumps(WikiteamCache))
+
+def loadYoutubearchiveCache():
+    """Return the initial ytarchive cache, which is always an empty dict.
+
+    Nothing is stored on disk for ytarchive. An empty dict (rather than None)
+    is returned so that YoutubearchiveCache stays a dict after the caller
+    assigns this result to it.
+    """
+    return {}
+
+def saveYoutubearchiveCache():
+ # Placeholder: does nothing, so the ytarchive cache is never written to disk.
+ pass
+
+def _fetchURL(request, url):
+    # Download `request` once and return the body as stripped UTF-8 text, gunzipping it when `url` ends in '.gz'
+    with urllib.request.urlopen(request) as response:
+        if url.endswith('.gz'):
+            data = gzip.GzipFile(fileobj=response).read()
+        else:
+            data = response.read()
+    return data.strip().decode('utf-8')
+
+def getURL(url='', cache=False, retry=True):
+    """Download `url` and return its body as text ('' when it cannot be retrieved).
+
+    url   -- address to fetch; URLs containing '8grwh' (deleted jobs/jsons) yield '' without a request.
+    cache -- when true, return the ArchivebotCache entry if there is one and store a freshly downloaded body in it.
+    retry -- when true, a failed download is retried after 10 and then 20 seconds.
+
+    A failed first attempt for a URL ending in '.json' returns '' immediately,
+    because some .json files are deleted on IA. Retrying stops as soon as one
+    attempt succeeds. Only Exception subclasses count as download failures, so
+    Ctrl-C is not swallowed.
+    """
+    global ArchivebotCache
+
+    if '8grwh' in url: #deleted jobs/jsons
+        return ''
+
+    if cache: #do not download if it is cached
+        if not ArchivebotCache: #empty dict
+            ArchivebotCache = loadArchivebotCache()
+        if url and url in ArchivebotCache:
+            #print("Using cached page for %s" % (url))
+            return ArchivebotCache[url]
+
+    raw = ''
+    fetched = False
+    headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0' }
+    request = urllib.request.Request(url, headers=headers)
+    try:
+        print("Retrieving: %s" % (url))
+        raw = _fetchURL(request, url)
+        fetched = True
+    except Exception:
+        if url.endswith('.json'): #some .json are deleted on IA
+            return ''
+
+        sleep = 10 # seconds
+        maxsleep = 30
+        while retry and sleep <= maxsleep:
+            print('Error while retrieving: %s' % (url))
+            print('Retry in %s seconds...' % (sleep))
+            time.sleep(sleep)
+            try:
+                raw = _fetchURL(request, url)
+                fetched = True
+                break # success: no need to wait and download again
+            except Exception:
+                sleep = sleep * 2
+
+    if fetched and cache: #refresh cache
+        ArchivebotCache[url] = raw
+        # Persist only occasionally (about 1 in 101 downloads) to limit disk writes
+        if not random.randint(0, 100):
+            saveArchivebotCache()
+    return raw
+
+def loadSPARQL(sparql=''):
+    """Decode the JSON text of a SPARQL response.
+
+    Returns the decoded object, or None (after printing a message) when
+    `sparql` is empty or is not valid JSON.
+    """
+    if not sparql:
+        print('Server return empty file')
+        return None
+    try:
+        return json.loads(sparql)
+    except (TypeError, ValueError): # json.JSONDecodeError is a ValueError
+        print('Error downloading SPARQL? Malformatted JSON? Skiping\n')
+        return None
+
+def genJobDetails(tool='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False):
+    """Build the wikitable cells describing one archive job.
+
+    The size cell is coloured when `jobsize` is an int: red below 1024 bytes,
+    otherwise purple for a problem job, orange for an aborted job and green for
+    the rest. A non-int size is shown as given, with sort value 0. The objects
+    cell sorts by the first space-separated token of `jobobjects`.
+    """
+    sizetext = convertsize(b=jobsize)
+    if type(jobsize) is int:
+        if jobsize < 1024:
+            jobcolor = 'red'
+        elif jobproblem:
+            jobcolor = 'purple'
+        elif jobaborted:
+            jobcolor = 'orange'
+        else:
+            jobcolor = 'green'
+        sizecell = 'data-sort-value=%d | {{%s|%s}}' % (jobsize, jobcolor, sizetext)
+    else:
+        sizecell = 'data-sort-value=0 | %s' % (sizetext)
+    return '| style="white-space: nowrap;" | %s || %s || %s || %s || %s || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, sizecell, jobobjects.split(' ')[0], jobobjects)
+
+def getArchiveDetailsArchivebot(url='', singleurl=False):
+ viewerurl = 'https://archive.fart.website/archivebot/viewer/?q=' + url
+ origdomain = url.split('://')[1].split('/')[0]
+ origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain)
+ rawdomains = getURL(url=viewerurl, cache=False)
+ domains = list(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains)))
+ if not domains: #no results for this url, remove cache
+ removeFromArchivebotCache(url=viewerurl)
+ details = []
+ totaljobsize = 0
+ jobslimit = 10 # before 10000
+ tool = '[[ArchiveBot]]'
+ for domain in domains:
+ if domain != origdomain and not domain in origdomain and not origdomain2 in domain:
+ continue
+ urljobs = "https://archive.fart.website/archivebot/viewer/domain/" + domain
+ rawjobs = getURL(url=urljobs, cache=False) #false, we want the most recent list of jobs always
+
+ #remove unrelated jobs, for example googlesites, facebook, etc
+ rawjobs2 = ""
+ for rawjobcandidate in rawjobs.split(""):
+ if url.split('://')[1].strip('/') in rawjobcandidate:
+ rawjobs2 += rawjobcandidate
+ rawjobs = rawjobs2
+
+ jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs)
+ print("jobs", jobs)
+ for jobid in jobs[:jobslimit]:
+ jobidtruncated = jobid[-5:]
+ urljob = "https://archive.fart.website/archivebot/viewer/job/" + jobid
+ print(urljob)
+ rawjob = getURL(url=urljob, cache=True)
+ jsonfileurls = re.findall(r'(?im)', rawjob)
+ for jsonfileurl in jsonfileurls:
+ print(jsonfileurl)
+ if singleurl:
+ jsonraw = getURL(url=jsonfileurl, cache=True) #cache json from internet archive
+ try:
+ jsonfileloaded = json.loads(jsonraw)
+ except:
+ continue
+ if not 'url' in jsonfileloaded or ('url' in jsonfileloaded and jsonfileloaded['url'].strip('/') != url.strip('/')):
+ continue
+
+ jobproblem = False
+ warcs = re.findall(r"(?im)\" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\">[^<>\"]*?\s*\s*| (\d+) | " % (jobidtruncated), rawjob)
+ print(warcs)
+ if not warcs:
+ jobproblem = True
+ jobdatetimes = []
+ for warc in warcs:
+ jobdatetimes.append("%s-%s" % (warc[1], warc[2]))
+ jobdatetimes = list(set(jobdatetimes))
+ jobdatetimes.sort()
+ for jobdatetime in jobdatetimes:
+ if not jobdatetime in jsonfileurl:
+ continue
+ warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobidtruncated), rawjob))
+ inforshallow = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobidtruncated), rawjob)))
+
+ print(jobdatetime, warcsnometa, inforshallow)
+
+ inforshallow = len(inforshallow) == 1 and inforshallow[0] or 'unknown'
+ toolb = "%s%s" % (tool, inforshallow == 'unknown' and '' or " (!%s)" % (inforshallow == 'inf' and 'a' or 'ao'))
+ jobaborted = False
+ if ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob:
+ jobaborted = True
+ jobdate = '-' in jobdatetime and jobdatetime.split('-')[0] or 'nodate'
+ jobsize = sum([jobdatetime == '%s-%s' % (warc[1], warc[2]) and int(warc[3]) or 0 for warc in warcs])
+ if jobdate and jobdate != 'nodate':
+ jobdate = '%s-%s-%s' % (jobdate[0:4], jobdate[4:6], jobdate[6:8])
+ #jobdetails = genJobDetails(tool=toolb, domainlink="[https://archive.fart.website/archivebot/viewer/domain/%s %s]" % (domain, domain), joburl="[https://archive.fart.website/archivebot/viewer/job/%s %s]" % (jobidtruncated, jobidtruncated), jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
+ jobdetails = genJobDetails(tool=toolb, domainlink=domain, joburl=jobidtruncated, jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
+ totaljobsize += jobsize
+ details.append(jobdetails)
+ return details, totaljobsize
+
+def getArchiveDetailsChromebot(url='', singleurl=False):
+    """Return (rows, total size) for the Chromebot jobs that archived `url`.
+
+    A job matches when its 'url' or, if present, its 'urlseed' equals `url`.
+    Rows are strings produced by genJobDetails. A job without 'warcsize' is
+    listed with size '-' and adds nothing to the total. `singleurl` is accepted
+    for signature parity with the other getArchiveDetails* functions and is not used.
+    """
+    global ChromebotCache
+    details = []
+    totaljobsize = 0
+    if not ChromebotCache: #empty dict
+        ChromebotCache = loadChromebotCache()
+        saveChromebotCache()
+    #{"id": "bajop-tomur-fagok-huzol", "user": "eientei95", "date": "2019-05-21T11:51:09.286515", "warcsize": 2775866, "url": "https://twitter.com/...", "urlcount": 1}
+    tool = '[[Chromebot]]'
+    for jobs in ChromebotCache.values():
+        for job in jobs:
+            if not (job['url'] == url or ('urlseed' in job and job['urlseed'] == url)):
+                continue
+            domain = url.split('://')[1].split('/')[0]
+            jobid = job['id'].split('-')[-1] # last chunk seems unique
+            jobdate = '-'
+            if 'date' in job:
+                jobdate = job['date'].split('T')[0]
+            elif 'queued' in job:
+                jobdate = job['queued'].split('T')[0]
+            jobsize = '-'
+            if 'warcsize' in job:
+                jobsize = int(job['warcsize'])
+            jobobjects = '1 urls'
+            if 'urlcount' in job:
+                jobobjects = "%s urls" % (int(job['urlcount']))
+            itemname = job['item']
+            jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects)
+            # Only a known (int) size can be added; the '-' placeholder would raise TypeError
+            if type(jobsize) is int:
+                totaljobsize += jobsize
+            details.append(jobdetails)
+    return details, totaljobsize
+
+def getArchiveDetailsNarabot(url='', singleurl=False):
+ # Return (rows, total size) for the Narabot items whose 'originalurl' starts with `url` (both compared with surrounding '/' stripped).
+ # Rows are strings produced by genJobDetails; `singleurl` is not used in this function.
+ global NarabotCache
+ details = []
+ totaljobsize = 0
+ if not NarabotCache: #empty dict
+ NarabotCache = loadNarabotCache()
+ saveNarabotCache()
+ tool = '[[Narabot]]'
+ for itemname, props in NarabotCache.items():
+ if props['originalurl'].strip('/').startswith(url.strip('/')):
+ domain = props['originalurl'].split('://')[1].split('/')[0]
+ urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname)
+ rawfiles = getURL(url=urlfiles, cache=True)
+ jobid = 'job'
+ # The job date is taken from the item name: the text after '_-_' up to the next '_'
+ jobdate = itemname.split('_-_')[1].split('_')[0]
+ # NOTE(review): as written this pattern sums every run of digits in the file listing; surrounding markup may have been stripped from the regex in this copy of the patch -- confirm against the real file
+ jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)])
+ jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 repo")
+ totaljobsize += jobsize
+ details.append(jobdetails)
+ return details, totaljobsize
+
+def getArchiveDetailsWikiteam(url='', singleurl=False):
+ global WikiteamCache
+ details = []
+ totaljobsize = 0
+ if not WikiteamCache: #empty dict
+ WikiteamCache = loadWikiteamCache()
+ saveWikiteamCache()
+ tool = '[[WikiTeam]]'
+ for itemname, props in WikiteamCache.items():
+ itemname_ = re.sub(r'(?im)^wiki-', '', itemname)
+ if props['originalurl'].strip('/').startswith(url.strip('/')):
+ #if item files follows wikidump/history filename style, we use every file in item like a different job
+ #otherwise we count just 1 job and sum all file sizes
+ domain = props['originalurl'].split('://')[1].split('/')[0]
+ urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname)
+ rawfiles = getURL(url=urlfiles, cache=True)
+ isstandard = re.search(r'(?im)' % (itemname_), rawfiles) and True or False
+ if isstandard:
+ for xfile in rawfiles.split(''):
+ jobid = 'job'
+ jobdate = 'date'
+ jobsize = re.findall(r'(?im)(\d+)', xfile) and int(re.findall(r'(?im)(\d+)', xfile)[0]) or 0
+ m = re.findall(r'(?im)' % (itemname_), xfile)
+ if m:
+ m = m[0]
+ jobid = m[3]
+ jobdate = '%s-%s-%s' % (m[2][0:4], m[2][4:6], m[2][6:8])
+ jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 dump")
+ totaljobsize += jobsize
+ details.append(jobdetails)
+ else:
+ jobid = 'other'
+ jobdate = re.findall(r'(?im)(\d+)', rawfiles) and int(re.findall(r'(?im)(\d+)', rawfiles)[0]) or 'date'
+ if type(jobdate) is int:
+ jobdate = datetime.datetime.utcfromtimestamp(jobdate).strftime('%Y-%m-%d')
+ jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)])
+ jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize)
+ totaljobsize += jobsize
+ details.append(jobdetails)
+ return details, totaljobsize
+
+def getArchiveDetailsYoutubearchive(url='', singleurl=False):
+    """Return (rows, total size) for a ytarchive download of a YouTube channel/user URL.
+
+    Only https://www.youtube.com/channel/... and /user/... URLs are looked up in
+    the ya.borg.xyz download logs. A row is produced when the first listed log
+    of the channel reports 'Finished downloading playlist'. The video count is
+    the playlist length minus the number of 'ERROR: ' occurrences in the log.
+    `singleurl` is not used in this function.
+    """
+    global YoutubearchiveCache
+    details = []
+    totaljobsize = 0
+    if not YoutubearchiveCache: #empty dict
+        YoutubearchiveCache = loadYoutubearchiveCache()
+        saveYoutubearchiveCache()
+    tool = '[[YouTube|ytarchive]]'
+    if re.search(r'https://www\.youtube\.com/(channel|user)/[^/]+', url):
+        domain = url.split('://')[1].split('/')[0]
+        channelid = url.split('/')[4].split('/')[0]
+        urlytarchive = 'https://ya.borg.xyz/logs/dl/?C=M;O=D'
+        rawytarchive = getURL(url=urlytarchive, cache=True)
+        # NOTE(review): this pattern and the `logs` pattern below look truncated in this copy of the patch (HTML-like text appears to have been stripped); they are kept as found -- confirm against the real file
+        channels = re.findall(r'(?im)', rawytarchive)
+        if channelid in channels:
+            urlytarchive2 = 'https://ya.borg.xyz/logs/dl/%s/?C=M;O=D' % (channelid)
+            rawytarchive2 = getURL(url=urlytarchive2, cache=True)
+            logs = re.findall(r'(?im)', rawytarchive2)
+            if logs:
+                logfilename = logs[0]
+                urlytarchive3 = 'https://ya.borg.xyz/logs/dl/%s/%s' % (channelid, logfilename)
+                rawytarchive3 = getURL(url=urlytarchive3, cache=True)
+                if re.search(r'Finished downloading playlist', rawytarchive3):
+                    jobid = '-'
+                    jobdate = logfilename.split('T')[0]
+                    jobsize = '-'
+                    jobobjects = '-'
+                    # "Downloading video N of N" marks the last video, so N is the playlist length
+                    lastvideo = re.findall(r'(?im)Downloading video (\d+) of \1$', rawytarchive3)
+                    if lastvideo:
+                        numvideos = int(lastvideo[0])
+                        # Count the error lines (not the length of the first match string)
+                        numerrors = len(re.findall(r'ERROR: ', rawytarchive3))
+                        jobobjects = "%s videos" % (numvideos-numerrors)
+                    jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="%s" % (jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects)
+                    if type(jobsize) is int:
+                        totaljobsize += jobsize
+                    details.append(jobdetails)
+    return details, totaljobsize
+
+def getArchiveDetailsCore(url='', singleurl=False):
+    """Return (wikitable rows as one string, total size) for `url`.
+
+    Only ArchiveBot is queried. The rows are sorted and joined with a table
+    row separator.
+    """
+    # Deprecated archives, no longer queried or added to the result:
+    #   getArchiveDetailsChromebot, getArchiveDetailsNarabot,
+    #   getArchiveDetailsWikiteam, getArchiveDetailsYoutubearchive
+    details, totaljobsize = getArchiveDetailsArchivebot(url=url, singleurl=singleurl)
+    details.sort()
+    return '\n|-\n'.join(details), totaljobsize
+
+def getArchiveDetails(url=''):
+    """Return (saved, details, total size) for `url`.
+
+    `saved` is True when any archive details were found. URLs without '://'
+    and URLs on archive.org or www.webcitation.org are never looked up and
+    yield (False, '', 0). A URL with more than one character after its domain
+    is looked up as a single URL, otherwise as a whole domain.
+    """
+    if not url or '://' not in url:
+        return False, '', 0
+    if '://archive.org/' in url or '://www.webcitation.org/' in url:
+        return False, '', 0
+
+    domain = url.split('://')[1].split('/')[0]
+    haspath = len(url.split(domain)[1]) > 1 # domain.ext/more rather than just domain.ext
+    details, totaljobsize = getArchiveDetailsCore(url=url, singleurl=haspath)
+    return bool(details), details, totaljobsize
diff --git a/wikibot-manual/urlteam-torrents.py b/wikibot-manual/urlteam-torrents.py
new file mode 100644
index 0000000..b6b205d
--- /dev/null
+++ b/wikibot-manual/urlteam-torrents.py
@@ -0,0 +1,20 @@
+import pywikibot
+import requests
+
+def main():
+ site = pywikibot.Site('en', 'ArchiveTeam')
+ page = pywikibot.Page(site, 'URLTeam/torrents')
+
+ IAResponse = requests.get('https://archive.org/services/search/v1/scrape?q=subject:terroroftinytown&count=10000')
+ IAItems = IAResponse.json()['items']
+ output = [f"https://archive.org/download/{IAItem['identifier']}/{IAItem['identifier']}_archive.torrent" for IAItem in IAItems]
+ outputStr = '\n' + '\n'.join(output) + '\n
'
+
+ # Update if necessary
+ if page.text != outputStr:
+ site.login() # Only log in when necessary
+ page.text = outputStr
+ page.save("Updated torrent list")
+
+
+main()
diff --git a/wikibot/archivebot.py b/wikibot/archivebot.py
deleted file mode 100644
index 7246a8d..0000000
--- a/wikibot/archivebot.py
+++ /dev/null
@@ -1,264 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# https://github.com/emijrp/internet-archive/raw/master/archivebot.py
-
-# Copyright (C) 2018-2019 Archive Team
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-
-import collections
-import datetime
-import json
-import re
-import sys
-import time
-import urllib.parse
-import urllib.request
-import pywikibot
-import pywikibot.pagegenerators as pagegenerators
-
-from archiveteamfun import *
-
-Entry = collections.namedtuple('Entry', ('sorturl', 'url', 'label', 'note', 'line'))
-truncationpattern = re.compile(r'^[^:/]+://(www\.)?')
-
-def parselistline(line):
- label = None
- note = None
- if '|' in line:
- url, rest = line.split('|', 1)
- args = map(str.strip, rest.split('|'))
- for position, arg in enumerate(args):
- if '=' in arg:
- key, value = map(str.strip, arg.split('=', 1))
- if key == 'label':
- label = value
- continue
- elif key == 'note':
- note = value
- continue
- # If it's neither, just treat it like it didn't have any '=' to begin with...
- if position == 0:
- label = arg
- elif position == 1:
- note = arg
- # Everything else is ignored
- else:
- url = line
- url = url.strip()
- if '://' in url and not '/' in url.split('://')[1]:
- url = url + '/'
- line = url + (' | label = ' + label if label else '') + (' | note = ' + note if note else '')
- sorturl = truncationpattern.sub('', url).lower()
- for domain in ('transfer.sh', 'transfer.kiska.pw', 'transfer.notkiska.pw', 'ix.io'):
- if domain == 'ix.io' and '+' not in sorturl:
- # Only apply this stripping to the undocumented trick URLs of format ix.io/code+/filename
- continue
- if sorturl.startswith(domain) and sum(x == '/' for x in sorturl) == 2:
- # For file hosting URLs that contain exactly two slashes, strip the first path component = the random file ID to sort by the filename instead.
- sorturl = domain + sorturl[sorturl.index('/', len(domain) + 1):]
- return Entry(sorturl = sorturl, url = url, label = label, note = note, line = line)
-
-def curateurls(wlist=''):
- # Returns a dict of sectionname => list of URLs entries
- # sectionname is None for URLs outside of a section (i.e. on a page without section or before the first section).
- # A "URL entry" in the list is an Entry object (namedtuple); the label is None if it isn't present.
-
- lines = []
- currentsectionname = None
- currentsectionentries = []
- sectionentries = {}
-
- def endsection():
- nonlocal currentsectionentries, lines, sectionentries, currentsectionname
- currentsectionentries = list(set(currentsectionentries)) # Deduplicate
- currentsectionentries.sort(key = lambda x: (x.sorturl, x.label if x.label is not None else '', x.url, x.note if x.note is not None else '', x.line))
- lines.extend(x.line for x in currentsectionentries)
- sectionentries[currentsectionname] = currentsectionentries
- currentsectionentries = []
-
- for line in wlist.text.strip().splitlines():
- if line.strip().startswith('='):
- # New section, sort and append previous section
- endsection()
- currentsectionname = line.strip().strip('=').strip()
- if currentsectionname in sectionentries:
- print('Warning: duplicate section name {!r} on page {}'.format(currentsectionname, wlist.title()))
- if lines:
- lines.append('')
- lines.append(line.strip())
- elif line.strip():
- currentsectionentries.append(parselistline(line))
- endsection()
-
- lines = '\n'.join(lines)
- if wlist.text != lines:
- wlist.text = lines
- wlist.save("BOT - Sorting list")
-
- return sectionentries
-
-def main():
- atsite = pywikibot.Site('en', 'ArchiveTeam')
- cat = pywikibot.Category(atsite, "Category:ArchiveBot")
- gen = pagegenerators.CategorizedPageGenerator(cat, start="!")
- pre = pagegenerators.PreloadingGenerator(gen)
- listlenlimit = 1000
- for page in pre:
- wtitle = page.title()
- wtext = page.text
-
- if len(sys.argv)>1 and not sys.argv[1] in wtitle:
- continue
-
- if not wtitle.startswith('ArchiveBot/'):
- continue
- wlist = pywikibot.Page(atsite, '%s/list' % (wtitle))
- if not wlist.exists():
- print("Page %s/list doesnt exist" % (wtitle))
- continue
- sectionentries = curateurls(wlist=wlist)
-
- print('\n===', wtitle, '===')
- if (not '' in wtext and not '' in wtext:
- print("No tag. Skiping...")
- continue
- if len(wlist.text.splitlines()) > listlenlimit:
- continue
-
- newtext = []
- totaljobsize = 0
- totalsaved = 0
- totalnotsaved = 0
-
- # Find blocks of page text that end with a bot tag
- blocks = wtext.split('')
-
- # The last block must be tag-free, so only iterate over the previous ones
- for block in blocks[:-1]:
- # Find beginning of bot tag
- pos = block.find('')
- if pos == -1:
- pos = block.find('')
- continue
-
- if block[pos:].startswith(''):
- # Sectionless tag, use section None
- section = None
- openingtag = ''
- elif block[pos:].startswith('', pos)
- if openend == -1:
- print("Block's opening tag does not have an end, skipping...")
- newtext.append(block)
- newtext.append('')
- continue
- section = block[pos + 9:openend].strip() # 9 = len('')
- continue
-
- if section not in sectionentries:
- print('Block references section {!r} which does not exist, skipping...'.format(section))
- newtext.append(block)
- newtext.append('')
- continue
-
- # Add prefixed text (if any)
- newtext.append(block[:pos])
-
- # Add opening tag (as it was before)
- newtext.append(openingtag)
-
- # Generate table
- c = 1
- rowsplain = ""
- sectionjobsize = 0
- sectionhasnotes = any(entry.note is not None for entry in sectionentries[section])
- for entry in sectionentries[section]:
- viewerplain = ''
- viewerdetailsplain = ''
- viewer = [getArchiveDetails(url=entry.url)]
- if viewer[0][0]:
- viewerplain = "{{saved}}"
- viewerdetailsplain = viewer[0][1]
- sectionjobsize += viewer[0][2]
- else:
- viewerplain = "{{notsaved}}"
- viewerdetailsplain = ''
- rowspan = len(re.findall(r'\|-', viewerdetailsplain))+1
- rowspanplain = 'rowspan=%d | ' % (rowspan) if rowspan>1 else ''
- if entry.label:
- urllabel = '{{URLAB|1=%s|2=%s}}' % (entry.url, entry.label)
- else:
- urllabel = '{{URLAB|1=%s}}' % (entry.url)
- if sectionhasnotes:
- notescolumn = '%s%s || ' % (rowspanplain, entry.note if entry.note is not None else '')
- else:
- notescolumn = ''
- rowsplain += "\n|-\n| %s%s || %s%s%s\n%s " % (rowspanplain, urllabel, notescolumn, rowspanplain, viewerplain, viewerdetailsplain if viewerdetailsplain else '| || || || || || ')
- c += 1
-
- totaljobsize += sectionjobsize
- sectionsaved = rowsplain.count('{{saved}}')
- totalsaved += sectionsaved
- sectionnotsaved = rowsplain.count('{{notsaved}}')
- totalnotsaved += sectionnotsaved
- notesheader = 'rowspan=2 | Notes !! ' if sectionhasnotes else ''
- output = """
-* '''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)
-
-Do not edit this table, it is automatically updated by bot. There is a [[{{FULLPAGENAME}}/list|raw list]] of URLs that you can edit.
-
-{| class="wikitable sortable plainlinks"
-! rowspan=2 | Website !! %srowspan=2 | Status !! colspan=6 | Archive details
-|-
-! Tool !! Domain !! Job !! Date !! Size !! Objects %s
-|}
-""" % (sectionsaved, sectionnotsaved, convertsize(b=sectionjobsize), notesheader, rowsplain)
- newtext.append(output)
-
- newtext.append('')
-
- # Add the last, tag-free block
- newtext.append(blocks[-1])
-
- newtext = ''.join(newtext)
-
- # Replace total statistics if necessary
- if '' in newtext:
- newtext = re.sub(r'.*?', "'''Statistics''': {{saved}} (%s)){{·}} {{notsaved}} (%s){{·}} Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b = totaljobsize)), newtext)
-
- if wtext != newtext:
- pywikibot.showDiff(wtext, newtext)
- page.text = newtext
- try:
- page.save("BOT - Updating page: {{saved}} (%s), {{notsaved}} (%s), Total size (%s)" % (totalsaved, totalnotsaved, convertsize(b=totaljobsize)))
- except:
- print("Error while saving...")
- else:
- print("No changes needed in", page.title())
-
- cleanArchiveBotCache()
-
-if __name__ == '__main__':
- main()
diff --git a/wikibot/archiveteamfun.py b/wikibot/archiveteamfun.py
deleted file mode 100644
index 2d5ff54..0000000
--- a/wikibot/archiveteamfun.py
+++ /dev/null
@@ -1,547 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# https://github.com/emijrp/internet-archive/raw/master/archiveteamfun.py
-
-# Copyright (C) 2018-2019 Archive Team
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-
-import datetime
-import gzip
-import json
-import os
-import pickle
-import random
-import re
-import sys
-import _thread
-import time
-import unicodedata
-import urllib
-import urllib.request
-import urllib.parse
-
-ArchivebotCache = {}
-ChromebotCache = {}
-NarabotCache = {}
-WikiteamCache = {}
-YoutubearchiveCache = {}
-
-# Ideas:
-# Mirrortube (no channel info in metadata :()
-# Videobot
-#
-# Fix:
-# https://archiveteam.org/index.php?title=ArchiveBot/2018_Brazilian_general_elections (portal.imprensanacional.gov.br no json = no saved?)
-#
-# Error no json:
-"""Retry in 20 seconds...
-Retrieving: https://archive.fart.website/archivebot/viewer/?q=https://transfer.notkiska.pw/kqFhq/twitter-@mattiastesfaye
-Retrieving: https://archive.org/download/archiveteam_archivebot_go_20190514190001/urls-transfer.notkiska.pw-berries.space-accounts-09-May-2019-inf-20190511-012325-8grwh.json
-"""
-
-def convertsize(b=0): #bytes
- if type(b) is int:
- if b < 1024: #<1KiB
- return '0 KiB'
- elif b < 1024*1024: #<1MiB
- return '%d KiB' % (b/(1024))
- elif b < 1024*1024*1024: #<1GiB
- return '%d MiB' % (b/(1024*1024))
- elif b < 1024*1024*1024*1024: #<1TiB
- return '%.1f GiB' % (b/(1024.0*1024*1024))
- elif b < 1024*1024*1024*1024*1024: #<1PiB
- return '%.1f TiB' % (b/(1024.0*1024*1024*1024))
- elif b < 1024*1024*1024*1024*1024*1024: #<1EiB
- return '%.1f PiB' % (b/(1024.0*1024*1024*1024*1024))
- else:
- return b
-
-def loadArchivebotCache():
- c = {}
- if os.path.exists('archivebot.cache'):
- with open('archivebot.cache', 'rb') as f:
- c = pickle.load(f)
- return c.copy()
-
-def removeFromArchivebotCache(url='', save=True):
- global ArchivebotCache
- if url and url in ArchivebotCache:
- del ArchivebotCache[url]
- if save:
- saveArchivebotCache()
-
-def saveArchivebotCache():
- global ArchivebotCache
- with open('archivebot.cache', 'wb') as f:
- pickle.dump(ArchivebotCache, f)
-
-def cleanArchiveBotCache():
- global ArchivebotCache
- ArchivebotCache2 = ArchivebotCache.copy()
-
- for url, raw in ArchivebotCache2.items():
- #remove from cache urls without results
- #we need to check for results in the next run
- if url.startswith("https://archive.fart.website/archivebot/viewer/?q="):
- if re.search(r'(?im)No search results.', raw):
- removeFromArchivebotCache(url=url, save=False)
-
- #remove from cache domains with many jobs (FB, TW, etc)
- #these result pages change frequently
- if url.startswith("https://archive.fart.website/archivebot/viewer/domain/"):
- domain = url.split("https://archive.fart.website/archivebot/viewer/domain/")[1]
- jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)", raw)
- if len(jobs) >= 10:
- removeFromArchivebotCache(url=url, save=False)
-
- #remove from cache jobs with problems or in progress
- #we need to check wether problems were solved in the next run
- if url.startswith("https://archive.fart.website/archivebot/viewer/job/"):
- job = url.split("https://archive.fart.website/archivebot/viewer/job/")[1]
- jsonfileurls = re.findall(r'(?im) ]+\.json)">', raw)
- if not jsonfileurls and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress
- removeFromArchivebotCache(url=url, save=False)
- warcs = re.findall(r"(?im)>\s*[^<>\"]+?-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\s*\s*\s*(\d+) | " % (job), raw)
- if not warcs and re.search(r'-%s\d{4}-\d{6}-' % (datetime.datetime.today().year), raw): #job in progress
- removeFromArchivebotCache(url=url, save=False)
-
- if 'borg.xyz/logs/' in url and not '.log' in url:
- removeFromArchivebotCache(url=url, save=False)
-
- saveArchivebotCache()
-
-def loadChromebotCache():
- c = {}
- if os.path.exists('chromebot.cache'):
- with open('chromebot.cache', 'rb') as f:
- c = pickle.load(f)
- firstcached = datetime.datetime(2019, 5, 7)
- today = datetime.datetime.today()
- iaquery = 'https://archive.org/advancedsearch.php?q=chromebot&fl[]=identifier&sort[]=publicdate+desc&sort[]=&sort[]=&rows=5000000&page=1&output=json'
- raw = getURL(url=iaquery, cache=False)
- json1 = json.loads(raw)
- for item in json1["response"]["docs"]:
- itemname = item['identifier']
- if not re.search(r'chromebot-\d\d\d\d-\d\d-\d\d-', itemname):
- continue
- itemdate = itemname.split('chromebot-')[1][:10]
- itemdate = datetime.datetime(int(itemdate.split('-')[0]), int(itemdate.split('-')[1]), int(itemdate.split('-')[2]))
- if itemdate >= firstcached and itemdate <= today:
- if not itemdate.isoformat() in c:
- c[itemdate.isoformat()] = []
- urlitem = 'https://archive.org/download/%s' % (itemname)
- raw2 = getURL(url=urlitem, cache=False)
- print('Loading .json for', item, itemdate)
- urljson = ''
- if '"jobs.json"' in raw2:
- urljson = 'https://archive.org/download/%s/jobs.json' % (itemname)
- elif '"jobs.json.gz"' in raw2:
- urljson = 'https://archive.org/download/%s/jobs.json.gz' % (itemname)
- if urljson:
- raw3 = getURL(url=urljson, cache=False)
- for line in raw3.splitlines():
- if line.startswith('{"id":'):
- json2 = json.loads(line)
- json2['item'] = itemname
- c[itemdate.isoformat()].append(json2)
- #print(c[itemdate.isoformat()][-1]['id'])
- return c.copy()
-
-def saveChromebotCache():
- global ChromebotCache
- with open('chromebot.cache', 'wb') as f:
- pickle.dump(ChromebotCache, f)
-
-def loadNarabotCache():
- c = {}
- if os.path.exists('narabot.cache'):
- with open('narabot.cache', 'rb') as f:
- c = pickle.load(f)
- iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Agithub_narabot_mirror&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json'
- raw = getURL(url=iaquery, cache=False)
- json1 = json.loads(raw)
- for item in json1["response"]["docs"]:
- if not 'originalurl' in item:
- continue
- itemname = item['identifier']
- originalurl = item['originalurl']
- if type(originalurl) is list:
- originalurl = originalurl[0]
- c[itemname] = { 'originalurl': originalurl }
- return c.copy()
-
-def saveNarabotCache():
- global NarabotCache
- with open('narabot.cache', 'wb') as f:
- pickle.dump(NarabotCache, f)
-
-def loadWikiteamCache():
- c = {}
- if os.path.exists('wikiteam.cache'):
- with open('wikiteam.cache', 'rb') as f:
- c = pickle.load(f)
- iaquery = 'https://archive.org/advancedsearch.php?q=collection%3Awikiteam&fl[]=identifier&fl[]=originalurl&sort[]=&sort[]=&sort[]=&rows=5000000&page=1&output=json'
- raw = getURL(url=iaquery, cache=False)
- json1 = json.loads(raw)
- for item in json1["response"]["docs"]:
- if not 'originalurl' in item:
- continue
- itemname = item['identifier']
- originalurl = item['originalurl']
- if type(originalurl) is list:
- originalurl = originalurl[0]
- #if not itemname.startswith('wiki-'):
- # continue
- c[itemname] = { 'originalurl': originalurl }
- return c.copy()
-
-def saveWikiteamCache():
- global WikiteamCache
- with open('wikiteam.cache', 'wb') as f:
- pickle.dump(WikiteamCache, f)
-
-def loadYoutubearchiveCache():
- pass
-
-def saveYoutubearchiveCache():
- pass
-
-def getURL(url='', cache=False, retry=True):
- global ArchivebotCache
-
- if '8grwh' in url: #deleted jobs/jsons
- return ''
-
- if cache: #do not download if it is cached
- if not ArchivebotCache: #empty dict
- ArchivebotCache = loadArchivebotCache()
- if url:
- if url in ArchivebotCache:
- #print("Using cached page for %s" % (url))
- return ArchivebotCache[url]
- raw = ''
- headers = { 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0' }
- request = urllib.request.Request(url, headers=headers)
- try:
- print("Retrieving: %s" % (url))
- response = urllib.request.urlopen(request)
- if url.endswith('.gz'):
- gzipFile = gzip.GzipFile(fileobj=response)
- raw = gzipFile.read().strip().decode('utf-8')
- else:
- raw = response.read().strip().decode('utf-8')
- if cache: #refresh cache
- ArchivebotCache[url] = raw
- if not random.randint(0, 100):
- saveArchivebotCache()
- except:
- if url.endswith('.json'): #some .json are deleted on IA
- return ''
-
- sleep = 10 # seconds
- maxsleep = 30
- while retry and sleep <= maxsleep:
- print('Error while retrieving: %s' % (url))
- print('Retry in %s seconds...' % (sleep))
- time.sleep(sleep)
- try:
- response = urllib.request.urlopen(request)
- if url.endswith('.gz'):
- gzipFile = gzip.GzipFile(fileobj=response)
- raw = gzipFile.read().strip().decode('utf-8')
- else:
- raw = response.read().strip().decode('utf-8')
- if cache: #refresh cache
- ArchivebotCache[url] = raw
- except:
- pass
- sleep = sleep * 2
- return raw
-
-def loadSPARQL(sparql=''):
- json1 = ''
- if sparql:
- try:
- json1 = json.loads(sparql)
- return json1
- except:
- print('Error downloading SPARQL? Malformatted JSON? Skiping\n')
- return
- else:
- print('Server return empty file')
- return
- return
-
-def genJobDetails(tool='', domainlink='', joburl='', jobdate='', jobsize='', jobobjects='', jobaborted=False, jobproblem=False):
- jobdetails = ""
- if type(jobsize) is int:
- if jobsize < 1024:
- jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{red|%s}} || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, jobsize, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects)
- else:
- jobcolor = 'green'
- if jobaborted:
- jobcolor = 'orange'
- if jobproblem:
- jobcolor = 'purple'
- jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=%d | {{%s|%s}} || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, jobsize, jobcolor, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects)
- else:
- jobdetails = '| style="white-space: nowrap;" | %s || %s || %s || %s || data-sort-value=0 | %s || data-sort-value=%s | %s' % (tool, domainlink, joburl, jobdate, convertsize(b=jobsize), jobobjects.split(' ')[0], jobobjects)
- return jobdetails
-
-def getArchiveDetailsArchivebot(url='', singleurl=False):
- viewerurl = 'https://archive.fart.website/archivebot/viewer/?q=' + url
- origdomain = url.split('://')[1].split('/')[0]
- origdomain2 = re.sub(r'(?im)^(www\d*)\.', '.', origdomain)
- rawdomains = getURL(url=viewerurl, cache=False)
- domains = list(set(re.findall(r"(?im)/archivebot/viewer/domain/([^<>\"]+)", rawdomains)))
- if not domains: #no results for this url, remove cache
- removeFromArchivebotCache(url=viewerurl)
- details = []
- totaljobsize = 0
- jobslimit = 10 # before 10000
- tool = '[[ArchiveBot]]'
- for domain in domains:
- if domain != origdomain and not domain in origdomain and not origdomain2 in domain:
- continue
- urljobs = "https://archive.fart.website/archivebot/viewer/domain/" + domain
- rawjobs = getURL(url=urljobs, cache=False) #false, we want the most recent list of jobs always
-
- #remove unrelated jobs, for example googlesites, facebook, etc
- rawjobs2 = ""
- for rawjobcandidate in rawjobs.split(""):
- if url.split('://')[1].strip('/') in rawjobcandidate:
- rawjobs2 += rawjobcandidate
- rawjobs = rawjobs2
-
- jobs = re.findall(r"(?im)/archivebot/viewer/job/([^<>\"]+)\"", rawjobs)
- print("jobs", jobs)
- for jobid in jobs[:jobslimit]:
- jobidtruncated = jobid[-5:]
- urljob = "https://archive.fart.website/archivebot/viewer/job/" + jobid
- print(urljob)
- rawjob = getURL(url=urljob, cache=True)
- jsonfileurls = re.findall(r'(?im)', rawjob)
- for jsonfileurl in jsonfileurls:
- print(jsonfileurl)
- if singleurl:
- jsonraw = getURL(url=jsonfileurl, cache=True) #cache json from internet archive
- try:
- jsonfileloaded = json.loads(jsonraw)
- except:
- continue
- if not 'url' in jsonfileloaded or ('url' in jsonfileloaded and jsonfileloaded['url'].strip('/') != url.strip('/')):
- continue
-
- jobproblem = False
- warcs = re.findall(r"(?im)\" ]+?-(inf|shallow)-(\d{8})-(\d{6})-%s[^<> ]*?\.warc\.gz\">[^<>\"]*?\s*\s*| (\d+) | " % (jobidtruncated), rawjob)
- print(warcs)
- if not warcs:
- jobproblem = True
- jobdatetimes = []
- for warc in warcs:
- jobdatetimes.append("%s-%s" % (warc[1], warc[2]))
- jobdatetimes = list(set(jobdatetimes))
- jobdatetimes.sort()
- for jobdatetime in jobdatetimes:
- if not jobdatetime in jsonfileurl:
- continue
- warcsnometa = len(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-(\d{8})-(\d{6})-%s-[^<> ]*?\d+\.warc\.gz" % (jobidtruncated), rawjob))
- inforshallow = list(set(re.findall(r"(?im)>\s*[^<>\"]+?-(inf|shallow)-\d{8}-\d{6}-%s[^<> ]*?\.warc\.gz" % (jobidtruncated), rawjob)))
-
- print(jobdatetime, warcsnometa, inforshallow)
-
- inforshallow = len(inforshallow) == 1 and inforshallow[0] or 'unknown'
- toolb = "%s%s" % (tool, inforshallow == 'unknown' and '' or " (!%s)" % (inforshallow == 'inf' and 'a' or 'ao'))
- jobaborted = False
- if ('%s-%s-aborted-' % (jobdatetime, jobid)) in rawjob or ('%s-%s-aborted.json' % (jobdatetime, jobid)) in rawjob:
- jobaborted = True
- jobdate = '-' in jobdatetime and jobdatetime.split('-')[0] or 'nodate'
- jobsize = sum([jobdatetime == '%s-%s' % (warc[1], warc[2]) and int(warc[3]) or 0 for warc in warcs])
- if jobdate and jobdate != 'nodate':
- jobdate = '%s-%s-%s' % (jobdate[0:4], jobdate[4:6], jobdate[6:8])
- #jobdetails = genJobDetails(tool=toolb, domainlink="[https://archive.fart.website/archivebot/viewer/domain/%s %s]" % (domain, domain), joburl="[https://archive.fart.website/archivebot/viewer/job/%s %s]" % (jobidtruncated, jobidtruncated), jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
- jobdetails = genJobDetails(tool=toolb, domainlink=domain, joburl=jobidtruncated, jobdate=jobdate, jobsize=jobsize, jobobjects="%d warcs" % (warcsnometa), jobaborted=jobaborted, jobproblem=jobproblem)
- totaljobsize += jobsize
- details.append(jobdetails)
- return details, totaljobsize
-
-def getArchiveDetailsChromebot(url='', singleurl=False):
- global ChromebotCache
- details = []
- totaljobsize = 0
- if not ChromebotCache: #empty dict
- ChromebotCache = loadChromebotCache()
- saveChromebotCache()
- #{"id": "bajop-tomur-fagok-huzol", "user": "eientei95", "date": "2019-05-21T11:51:09.286515", "warcsize": 2775866, "url": "https://twitter.com/...", "urlcount": 1}
- tool = '[[Chromebot]]'
- for date, jobs in ChromebotCache.items():
- for job in jobs:
- if job['url'] == url or ('urlseed' in job and job['urlseed'] == url):
- domain = url.split('://')[1].split('/')[0]
- jobid = job['id'].split('-')[-1] # last chunk seems unique
- jobdate = '-'
- if 'date' in job:
- jobdate = job['date'].split('T')[0]
- elif 'queued' in job:
- jobdate = job['queued'].split('T')[0]
- jobsize = '-'
- if 'warcsize' in job:
- jobsize = int(job['warcsize'])
- jobobjects = '1 urls'
- if 'urlcount' in job:
- jobobjects = "%s urls" % (int(job['urlcount']))
- itemname = job['item']
- jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects)
- totaljobsize += jobsize
- details.append(jobdetails)
- return details, totaljobsize
-
-def getArchiveDetailsNarabot(url='', singleurl=False):
- global NarabotCache
- details = []
- totaljobsize = 0
- if not NarabotCache: #empty dict
- NarabotCache = loadNarabotCache()
- saveNarabotCache()
- tool = '[[Narabot]]'
- for itemname, props in NarabotCache.items():
- if props['originalurl'].strip('/').startswith(url.strip('/')):
- domain = props['originalurl'].split('://')[1].split('/')[0]
- urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname)
- rawfiles = getURL(url=urlfiles, cache=True)
- jobid = 'job'
- jobdate = itemname.split('_-_')[1].split('_')[0]
- jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)])
- jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 repo")
- totaljobsize += jobsize
- details.append(jobdetails)
- return details, totaljobsize
-
-def getArchiveDetailsWikiteam(url='', singleurl=False):
- global WikiteamCache
- details = []
- totaljobsize = 0
- if not WikiteamCache: #empty dict
- WikiteamCache = loadWikiteamCache()
- saveWikiteamCache()
- tool = '[[WikiTeam]]'
- for itemname, props in WikiteamCache.items():
- itemname_ = re.sub(r'(?im)^wiki-', '', itemname)
- if props['originalurl'].strip('/').startswith(url.strip('/')):
- #if item files follows wikidump/history filename style, we use every file in item like a different job
- #otherwise we count just 1 job and sum all file sizes
- domain = props['originalurl'].split('://')[1].split('/')[0]
- urlfiles = 'https://archive.org/download/%s/%s_files.xml' % (itemname, itemname)
- rawfiles = getURL(url=urlfiles, cache=True)
- isstandard = re.search(r'(?im)' % (itemname_), rawfiles) and True or False
- if isstandard:
- for xfile in rawfiles.split(''):
- jobid = 'job'
- jobdate = 'date'
- jobsize = re.findall(r'(?im)(\d+)', xfile) and int(re.findall(r'(?im)(\d+)', xfile)[0]) or 0
- m = re.findall(r'(?im)' % (itemname_), xfile)
- if m:
- m = m[0]
- jobid = m[3]
- jobdate = '%s-%s-%s' % (m[2][0:4], m[2][4:6], m[2][6:8])
- jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize, jobobjects="1 dump")
- totaljobsize += jobsize
- details.append(jobdetails)
- else:
- jobid = 'other'
- jobdate = re.findall(r'(?im)(\d+)', rawfiles) and int(re.findall(r'(?im)(\d+)', rawfiles)[0]) or 'date'
- if type(jobdate) is int:
- jobdate = datetime.datetime.utcfromtimestamp(jobdate).strftime('%Y-%m-%d')
- jobsize = sum([int(x) for x in re.findall(r'(?im)(\d+)', rawfiles)])
- jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="[https://archive.org/download/%s %s]" % (itemname, jobid), jobdate=jobdate, jobsize=jobsize)
- totaljobsize += jobsize
- details.append(jobdetails)
- return details, totaljobsize
-
-def getArchiveDetailsYoutubearchive(url='', singleurl=False):
- global YoutubearchiveCache
- details = []
- totaljobsize = 0
- if not YoutubearchiveCache: #empty dict
- YoutubearchiveCache = loadYoutubearchiveCache()
- saveYoutubearchiveCache()
- tool = '[[YouTube|ytarchive]]'
- if re.search(r'https://www\.youtube\.com/(channel|user)/[^/]+', url):
- domain = url.split('://')[1].split('/')[0]
- channelid = url.split('/')[4].split('/')[0]
- urlytarchive = 'https://ya.borg.xyz/logs/dl/?C=M;O=D'
- rawytarchive = getURL(url=urlytarchive, cache=True)
- channels = re.findall(r'(?im)', rawytarchive)
- if channelid in channels:
- urlytarchive2 = 'https://ya.borg.xyz/logs/dl/%s/?C=M;O=D' % (channelid)
- rawytarchive2 = getURL(url=urlytarchive2, cache=True)
- logs = re.findall(r'(?im)', rawytarchive2)
- if logs:
- logfilename = logs[0]
- urlytarchive3 = 'https://ya.borg.xyz/logs/dl/%s/%s' % (channelid, logfilename)
- rawytarchive3 = getURL(url=urlytarchive3, cache=True)
- if re.search(r'Finished downloading playlist', rawytarchive3):
- jobid = '-'
- jobdate = logfilename.split('T')[0]
- jobsize = '-'
- jobobjects = '-'
- if re.search(r'(?im)Downloading video (\d+) of \1$', rawytarchive3):
- numvideos = int(re.findall(r'(?im)Downloading video (\d+) of \1$', rawytarchive3)[0])
- numerrors = re.findall(r'ERROR: ', rawytarchive3) and len(re.findall(r'ERROR: ', rawytarchive3)[0]) or 0
- jobobjects = "%s videos" % (numvideos-numerrors)
- jobdetails = genJobDetails(tool=tool, domainlink=domain, joburl="%s" % (jobid), jobdate=jobdate, jobsize=jobsize, jobobjects=jobobjects)
- if type(jobsize) is int:
- totaljobsize += jobsize
- details.append(jobdetails)
- return details, totaljobsize
-
-def getArchiveDetailsCore(url='', singleurl=False):
- detailsArchivebot, totaljobsizeArchivebot = getArchiveDetailsArchivebot(url=url, singleurl=singleurl)
- """
- deprecated archives
-
- detailsChromebot, totaljobsizeChromebot = getArchiveDetailsChromebot(url=url, singleurl=singleurl)
- detailsNarabot, totaljobsizeNarabot = getArchiveDetailsNarabot(url=url, singleurl=singleurl)
- detailsWikiteam, totaljobsizeWikiteam = getArchiveDetailsWikiteam(url=url, singleurl=singleurl)
- detailsYoutubearchive, totaljobsizeYoutubearchive = getArchiveDetailsYoutubearchive(url=url, singleurl=singleurl)
- """
-
- """
- details = detailsArchivebot + detailsChromebot + detailsNarabot + detailsWikiteam + detailsYoutubearchive
- totaljobsize = totaljobsizeArchivebot + totaljobsizeChromebot + totaljobsizeNarabot + totaljobsizeWikiteam + totaljobsizeYoutubearchive
- """
-
- details = detailsArchivebot
- totaljobsize = totaljobsizeArchivebot
-
- details.sort()
- detailsplain = '\n|-\n'.join(details)
- return detailsplain, totaljobsize
-
-def getArchiveDetails(url=''):
- if url and '://' in url:
- if '://archive.org/' in url or \
- '://www.webcitation.org/' in url:
- return False, '', 0
-
- domain = url.split('://')[1].split('/')[0]
- if len(url.split(domain)[1]) > 1: #url is domain.ext/more
- details, totaljobsize = getArchiveDetailsCore(url=url, singleurl=True)
- return details and True or False, details, totaljobsize
-
- #url is domain.ext
- details, totaljobsize = getArchiveDetailsCore(url=url, singleurl=False)
- return details and True or False, details, totaljobsize
-
- return False, '', 0
diff --git a/wikibot/urlteam-torrents.py b/wikibot/urlteam-torrents.py
deleted file mode 100644
index b6b205d..0000000
--- a/wikibot/urlteam-torrents.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pywikibot
-import requests
-
-def main():
- site = pywikibot.Site('en', 'ArchiveTeam')
- page = pywikibot.Page(site, 'URLTeam/torrents')
-
- IAResponse = requests.get('https://archive.org/services/search/v1/scrape?q=subject:terroroftinytown&count=10000')
- IAItems = IAResponse.json()['items']
- output = [f"https://archive.org/download/{IAItem['identifier']}/{IAItem['identifier']}_archive.torrent" for IAItem in IAItems]
- outputStr = '\n' + '\n'.join(output) + '\n
'
-
- # Update if necessary
- if page.text != outputStr:
- site.login() # Only log in when necessary
- page.text = outputStr
- page.save("Updated torrent list")
-
-
-main()
--
cgit v1.3.1-10-gc9f91