"""Keep the Wayback Machine exclusion lists on the ArchiveTeam wiki sorted.

Every page whose title starts with 'List of websites excluded from the
Wayback Machine' is scanned; each run of consecutive lines that contain a
URL is sorted by that URL, and an optional "This list currently contains
N URLs." sentence between two marker strings is refreshed.
"""
import collections  # noqa: F401 -- kept from the original import list
import re

# Matches "<scheme>://", skips an optional leading "www.", and captures
# everything that follows up to the next whitespace (host AND path).
extractionPattern = re.compile(r'[^:/]+://(?:www\.)?(\S+)')

# Marker strings that delimit the URL-count sentence inside a page.
# NOTE(review): both are empty here; presumably they are meant to be the
# wiki-markup markers surrounding the count sentence -- confirm against the
# wiki pages.  While either one is empty the count is not touched at all.
countMarkBegin = ''
countMarkEnd = ''


def get_cleaned_url(line):
    """Return the sort key for the first URL on *line*, or None.

    The key is the lower-cased text following "scheme://" (with a leading
    "www." removed) up to the next whitespace, i.e. host plus any path.
    None is returned when the line contains no URL.
    """
    match = extractionPattern.search(line)
    if match is None:
        return None
    return match.group(1).lower()


def _sort_url_blocks(text):
    """Sort each run of consecutive URL lines; return (new_text, url_count)."""
    output = []
    block = []  # (sort key, line) pairs of the run currently being collected
    url_count = 0

    def sorted_block_lines():
        # list.sort/sorted are stable, so lines with equal keys keep their
        # original relative order.
        return [line for _, line in sorted(block, key=lambda entry: entry[0])]

    for line in text.split('\n'):
        key = get_cleaned_url(line)
        if key is not None:
            block.append((key, line))
            url_count += 1
            continue
        # A line without a URL terminates the current run (if any).
        output.extend(sorted_block_lines())
        block = []
        output.append(line)
    # The text may end in the middle of a run.
    output.extend(sorted_block_lines())
    return '\n'.join(output), url_count


def _update_count(text, url_count, begin_mark, end_mark):
    """Rewrite the sentence between the two markers; return the new text.

    The text is returned unchanged when a marker is empty, when the begin
    marker is absent, or when no end marker follows the begin marker.
    """
    if not begin_mark or not end_mark:
        # With an empty marker str.find() matches at position 0, which would
        # prepend a fresh sentence to the page on every single run.
        return text
    begin_pos = text.find(begin_mark)
    if begin_pos == -1:
        return text
    content_start = begin_pos + len(begin_mark)
    # Search after the begin marker: an end marker may also occur earlier.
    end_pos = text.find(end_mark, content_start)
    if end_pos == -1:
        return text
    sentence = 'This list currently contains {} URL{}.'.format(
        url_count, '' if url_count == 1 else 's')
    return text[:content_start] + sentence + text[end_pos:]


def handle_page(site, page, count_mark_begin=None, count_mark_end=None):
    """Sort the URL blocks of *page* and refresh its URL count.

    Args:
        site: object providing ``login()``; called only when saving.
        page: object providing a read/write ``text`` attribute and
            ``save(summary)``.
        count_mark_begin: marker preceding the count sentence; defaults to
            the module-level ``countMarkBegin``.
        count_mark_end: marker following the count sentence; defaults to
            the module-level ``countMarkEnd``.

    The page is saved only if its text actually changed.
    """
    if count_mark_begin is None:
        count_mark_begin = countMarkBegin
    if count_mark_end is None:
        count_mark_end = countMarkEnd

    new_text, url_count = _sort_url_blocks(page.text)
    new_text = _update_count(new_text, url_count,
                             count_mark_begin, count_mark_end)

    # Update if necessary
    if page.text != new_text:
        site.login()
        page.text = new_text
        page.save("Reordered websites and/or updated count.")


def main():
    """Process every matching list page on the ArchiveTeam wiki."""
    # Imported here, by its only user, so that the text helpers above can be
    # imported without pywikibot being installed.
    import pywikibot

    site = pywikibot.Site('en', 'ArchiveTeam')
    prefix = 'List of websites excluded from the Wayback Machine'
    for page in site.allpages(prefix=prefix):
        handle_page(site, page)


if __name__ == '__main__':
    main()