#!/usr/bin/env python
# -*- coding: utf-8 -*-

# https://github.com/emijrp/internet-archive/raw/master/archivebot.py
# Copyright (C) 2018-2019 Archive Team

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import collections
import datetime
import json
import re
import sys
import time
import urllib.parse
import urllib.request

import pywikibot
import pywikibot.pagegenerators as pagegenerators

import archiveteamfun

#{{ stubbed out changes call, only valid if the page is only edited by the bot.
import atexit

# Number of locally queued page saves that triggers a real submission.
BATCH_SIZE = 50
# Page title => {'text', 'summary', 'id', 'site'} of edits not yet submitted.
pending_changes = {}
_original_save = pywikibot.Page.save
# Default site; only used for queued entries that carry no site of their own.
site = pywikibot.Site()


def commit_pending_changes():
    """Submit every queued page edit to the wiki and empty the queue.

    A queued edit is dropped when the page's latest revision id no longer
    matches the one recorded when the edit was queued.  Registered with
    ``atexit`` and also called by ``stub_save`` once ``BATCH_SIZE`` edits
    have accumulated.
    """
    if not pending_changes:
        return
    print('[Wiki] Submitting %s changes.' % (len(pending_changes)))
    for title, data in pending_changes.items():
        # Rebuild the page on the site it was queued from; the module-level
        # default site is not necessarily the wiki that main() works on.
        page = pywikibot.Page(data.get('site', site), title)
        if data['id'] != page.latest_revision_id:
            # If the page was edited in the mean time, don't update it.
            # (The bot reorders /list pages, but that can happen later, I'd rather we not lose URLs the bot doesn't know about.)
            print('[Wiki] Page [[%s]] was edited in the mean time, not updating it.' % (title))
            continue
        page.text = data['text']
        _original_save(page, data['summary'])
        print("[Wiki] Page [[%(title)s]] updated with summary: %(summary)s" % {'title': title, 'summary': data['summary']})
    pending_changes.clear()


def stub_save(self, summary = ""):
    """Replacement for ``pywikibot.Page.save`` that queues the edit locally.

    The page text, summary, latest revision id and site are stored under the
    page title; a later save of the same title overwrites the earlier one.
    The queue is submitted once it holds ``BATCH_SIZE`` entries.
    """
    pending_changes[self.title()] = {
        'text': self.text,
        'summary': summary,
        'id': self.latest_revision_id,
        'site': self.site,
    }
    print("[Wiki Stub] Saved [[%(page)s]] locally with summary: %(summary)s" % {'page': self.title(), 'summary': summary})
    if len(pending_changes) >= BATCH_SIZE:
        commit_pending_changes()


atexit.register(commit_pending_changes)
pywikibot.Page.save = stub_save
#}}

Entry = collections.namedtuple('Entry', ('sorturl', 'url', 'label', 'note', 'line'))
truncationpattern = re.compile(r'^[^:/]+://(www\.)?')

# NOTE(review): every HTML-comment marker literal was missing from the copy of
# this file under review (the literals had collapsed to '').  The three bot
# markers below are reconstructed from the surviving "9 = len(...)" comment and
# the surrounding parsing logic; the two total-statistics markers are a guess.
# Confirm all five against the upstream file before deploying.
BOT_TAG = '<!-- bot -->'
BOT_SECTION_TAG_PREFIX = '<!-- bot:'
BOT_END_TAG = '<!-- /bot -->'
TOTAL_TAG = '<!-- bot-total -->'
TOTAL_END_TAG = '<!-- /bot-total -->'


def parselistline(line):
    """Parse one line of a /list page into an ``Entry``.

    A line is ``URL`` optionally followed by ``|``-separated arguments.
    ``label = ...`` and ``note = ...`` are recognised by name; otherwise the
    first argument is the label and the second the note, and anything further
    is ignored.  A URL without any path gets a trailing ``/``.

    Returns an Entry whose ``line`` is the normalised line text and whose
    ``sorturl`` is the lower-cased URL without scheme and leading ``www.``.
    """
    label = None
    note = None
    if '|' in line:
        url, rest = line.split('|', 1)
        args = map(str.strip, rest.split('|'))
        for position, arg in enumerate(args):
            if '=' in arg:
                key, value = map(str.strip, arg.split('=', 1))
                if key == 'label':
                    label = value
                    continue
                elif key == 'note':
                    note = value
                    continue
                # If it's neither, just treat it like it didn't have any '=' to begin with...
            if position == 0:
                label = arg
            elif position == 1:
                note = arg
            # Everything else is ignored
    else:
        url = line
    url = url.strip()
    # Split only on the first '://' so that a second '://' later in the URL
    # (e.g. inside a query string) does not hide the path from this check.
    if '://' in url and '/' not in url.split('://', 1)[1]:
        url = url + '/'
    line = url + (' | label = ' + label if label else '') + (' | note = ' + note if note else '')
    sorturl = truncationpattern.sub('', url).lower()
    for domain in ('transfer.sh', 'transfer.kiska.pw', 'transfer.archivete.am', 'transfer.notkiska.pw', 'ix.io'):
        if domain == 'ix.io' and '+' not in sorturl:
            # Only apply this stripping to the undocumented trick URLs of format ix.io/code+/filename
            continue
        # Require the slash after the domain so that a longer host name which
        # merely starts with the domain (e.g. transfer.shop) is left alone.
        if sorturl.startswith(domain + '/') and sorturl.count('/') == 2:
            # For file hosting URLs that contain exactly two slashes, strip the first path component = the random file ID to sort by the filename instead.
            sorturl = domain + sorturl[sorturl.index('/', len(domain) + 1):]
    return Entry(sorturl = sorturl, url = url, label = label, note = note, line = line)


def curateurls(wlist=''):
    """Sort and deduplicate a /list page and return its entries by section.

    ``wlist`` must be a page object providing ``text``, ``title()`` and
    ``save()``; the ``''`` default is kept for compatibility but is not usable.

    Returns a dict of sectionname => list of URL entries.
    sectionname is None for URLs outside of a section (i.e. on a page without
    section or before the first section).  A "URL entry" in the list is an
    Entry object (namedtuple); the label is None if it isn't present.

    If the normalised text differs from the page text, the page text is
    replaced and the page is saved.  When a section name occurs twice, a
    warning is printed and the later section's entries replace the earlier
    ones in the returned dict.
    """
    lines = []
    currentsectionname = None
    currentsectionentries = []
    sectionentries = {}

    def endsection():
        # Flush the entries collected for the current section.
        nonlocal currentsectionentries
        currentsectionentries = list(set(currentsectionentries)) # Deduplicate
        currentsectionentries.sort(key = lambda x: (x.sorturl, x.label if x.label is not None else '', x.url, x.note if x.note is not None else '', x.line))
        lines.extend(x.line for x in currentsectionentries)
        sectionentries[currentsectionname] = currentsectionentries
        currentsectionentries = []

    for line in wlist.text.strip().splitlines():
        stripped = line.strip()
        if stripped.startswith('='):
            # New section, sort and append previous section
            endsection()
            currentsectionname = stripped.strip('=').strip()
            if currentsectionname in sectionentries:
                print('Warning: duplicate section name {!r} on page {}'.format(currentsectionname, wlist.title()))
            if lines:
                lines.append('')
            lines.append(stripped)
        elif stripped:
            currentsectionentries.append(parselistline(line))
    endsection()

    lines = '\n'.join(lines)
    if wlist.text != lines:
        wlist.text = lines
        wlist.save("BOT - Sorting list")
    return sectionentries


def _rendersection(entries):
    """Build the statistics line and status table for one section.

    Returns a tuple (wikitext, saved count, notsaved count, job size sum).
    """
    sectionhasnotes = any(entry.note is not None for entry in entries)
    rows = []
    sectionjobsize = 0
    for entry in entries:
        # The result is indexed as [0] saved flag, [1] details wikitext,
        # [2] job size -- presumably a 3-tuple; verify against archiveteamfun.
        details = archiveteamfun.getArchiveDetails(url=entry.url)
        if details[0]:
            viewerplain = "{{saved}}"
            viewerdetailsplain = details[1]
            sectionjobsize += details[2]
        else:
            viewerplain = "{{notsaved}}"
            viewerdetailsplain = ''
        # One extra table row per '|-' found in the details text.
        rowspan = len(re.findall(r'\|-', viewerdetailsplain)) + 1
        rowspanplain = 'rowspan=%d | ' % (rowspan) if rowspan > 1 else ''
        if entry.label:
            urllabel = '{{URLAB|1=%s|2=%s}}' % (entry.url, entry.label)
        else:
            urllabel = '{{URLAB|1=%s}}' % (entry.url)
        if sectionhasnotes:
            notescolumn = '%s%s || ' % (rowspanplain, entry.note if entry.note is not None else '')
        else:
            notescolumn = ''
        rows.append("\n|-\n| %s%s || %s%s%s\n%s " % (rowspanplain, urllabel, notescolumn, rowspanplain, viewerplain, viewerdetailsplain if viewerdetailsplain else '| || || || || || '))
    rowsplain = ''.join(rows)
    # Counted on the rendered rows, so a marker inside a label, note or
    # details text is counted as well.
    sectionsaved = rowsplain.count('{{saved}}')
    sectionnotsaved = rowsplain.count('{{notsaved}}')
    notesheader = 'rowspan=2 | Notes !! ' if sectionhasnotes else ''
    # NOTE(review): the line breaks inside this template were lost in the copy
    # under review and are reconstructed here; confirm against upstream.
    output = """
* '''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)

Do not edit this table, it is automatically updated by bot. There is a [[{{FULLPAGENAME}}/list|raw list]] of URLs that you can edit.

{| class="wikitable sortable plainlinks"
! rowspan=2 | Website !! %srowspan=2 | Status !! colspan=6 | Archive details
|-
! AB Mode !! Domain !! Job !! Date !! Size !! Objects %s
|}
""" % (sectionsaved, sectionnotsaved, archiveteamfun.convertsize(b=sectionjobsize), notesheader, rowsplain)
    return output, sectionsaved, sectionnotsaved, sectionjobsize


def main():
    """Update the status tables on all ArchiveBot/* pages of the wiki.

    For every page in Category:ArchiveBot whose title starts with
    ``ArchiveBot/`` (and contains ``sys.argv[1]`` when given), the matching
    ``/list`` subpage is curated, then each bot-tagged block of the page is
    regenerated from the list entries of the section it references.
    """
    atsite = pywikibot.Site('en', 'ArchiveTeam')
    cat = pywikibot.Category(atsite, "Category:ArchiveBot")
    gen = pagegenerators.CategorizedPageGenerator(cat, start="!")
    pre = pagegenerators.PreloadingGenerator(gen)
    listlenlimit = 1000

    for page in pre:
        wtitle = page.title()
        wtext = page.text

        if len(sys.argv) > 1 and sys.argv[1] not in wtitle:
            continue
        if not wtitle.startswith('ArchiveBot/'):
            continue
        wlist = pywikibot.Page(atsite, '%s/list' % (wtitle))
        if not wlist.exists():
            print("Page %s/list doesnt exist" % (wtitle))
            continue
        sectionentries = curateurls(wlist=wlist)

        print('\n===', wtitle, '===')
        if (BOT_TAG not in wtext and BOT_SECTION_TAG_PREFIX not in wtext) or BOT_END_TAG not in wtext:
            print("No %s tag. Skiping..." % (BOT_TAG))
            continue
        if len(wlist.text.splitlines()) > listlenlimit:
            print("List has more than %d lines, skipping..." % (listlenlimit))
            continue

        newtext = []
        totaljobsize = 0
        totalsaved = 0
        totalnotsaved = 0
        # Find blocks of page text that end with a bot tag
        blocks = wtext.split(BOT_END_TAG)
        # The last block must be tag-free, so only iterate over the previous ones
        for block in blocks[:-1]:
            # Find beginning of bot tag
            pos = block.find(BOT_TAG)
            if pos == -1:
                pos = block.find(BOT_SECTION_TAG_PREFIX)
            if pos == -1:
                # NOTE(review): this branch was lost in the copy under review;
                # the message wording is reconstructed.
                print('Block has no opening tag, skipping...')
                newtext.append(block)
                newtext.append(BOT_END_TAG)
                continue
            if block[pos:].startswith(BOT_TAG):
                # Sectionless tag, use section None
                section = None
                openingtag = BOT_TAG
            elif block[pos:].startswith(BOT_SECTION_TAG_PREFIX):
                openend = block.find('-->', pos)
                if openend == -1:
                    print("Block's opening tag does not have an end, skipping...")
                    newtext.append(block)
                    newtext.append(BOT_END_TAG)
                    continue
                section = block[pos + len(BOT_SECTION_TAG_PREFIX):openend].strip()
                # NOTE(review): reconstructed; keeps the opening tag exactly as written on the page.
                openingtag = block[pos:openend + len('-->')]
            else:
                # NOTE(review): defensive branch reconstructed from a surviving
                # "continue"; not expected to be reachable.
                print('Block has an unrecognised opening tag, skipping...')
                newtext.append(block)
                newtext.append(BOT_END_TAG)
                continue
            if section not in sectionentries:
                print('Block references section {!r} which does not exist, skipping...'.format(section))
                newtext.append(block)
                newtext.append(BOT_END_TAG)
                continue

            # Add prefixed text (if any)
            newtext.append(block[:pos])
            # Add opening tag (as it was before)
            newtext.append(openingtag)

            # Generate table
            output, sectionsaved, sectionnotsaved, sectionjobsize = _rendersection(sectionentries[section])
            totaljobsize += sectionjobsize
            totalsaved += sectionsaved
            totalnotsaved += sectionnotsaved
            newtext.append(output)
            newtext.append(BOT_END_TAG)
        # Add the last, tag-free block
        newtext.append(blocks[-1])
        newtext = ''.join(newtext)

        # Replace total statistics if necessary
        if TOTAL_TAG in newtext:
            # NOTE(review): wrapping the replacement in the markers is
            # reconstructed; without them the next run could not find the span.
            # The doubled ")" after the saved count in the original string
            # was a typo and is dropped here.
            totalline = "%s'''Statistics''': {{saved}} (%s){{·}} {{notsaved}} (%s){{·}} Total size (%s)%s" % (TOTAL_TAG, totalsaved, totalnotsaved, archiveteamfun.convertsize(b = totaljobsize), TOTAL_END_TAG)
            # A callable replacement keeps backslashes in the text literal.
            newtext = re.sub(re.escape(TOTAL_TAG) + r'.*?' + re.escape(TOTAL_END_TAG), lambda match: totalline, newtext)

        if wtext != newtext:
            pywikibot.showDiff(wtext, newtext)
            page.text = newtext
            try:
                page.save("BOT - Updating page: {{saved}} (%s), {{notsaved}} (%s), Total size (%s)" % (totalsaved, totalnotsaved, archiveteamfun.convertsize(b=totaljobsize)))
            except Exception as error:
                # Best effort: report the failure and carry on with the next page.
                print("Error while saving...", error)
        else:
            print("No changes needed in", page.title())
    # NOTE(review): indentation was lost in the copy under review; this is
    # assumed to run once after all pages rather than once per page.
    archiveteamfun.cleanArchiveBotCache()


if __name__ == '__main__':
    main()