summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--wikibot/discourse.py60
1 files changed, 60 insertions, 0 deletions
diff --git a/wikibot/discourse.py b/wikibot/discourse.py
new file mode 100644
index 0000000..80ca729
--- /dev/null
+++ b/wikibot/discourse.py
@@ -0,0 +1,60 @@
+import collections
+import pywikibot
+import re
+
+# made from: sed \
+# -e 's/^* \[//' -e 's/^* //' \
+# -e 's/^http//' -e 's/^s//' \
+# -e 's*^://**' -e 's/\..*//' \
+# wikientries_joined.txt |\
+# sort | uniq -c | sort -hr
+# and then manually curated the subdomain part of the regex
# Well-known "forum-like" subdomain prefixes (ask., forum., discuss., www., ...)
# including the trailing dot. Only non-capturing groups are used so that the
# sole capturing group of extractionPattern is the remainder of the URL.
subdomainPattern = r'(?:(?:ask|bbs|chat|community|comunidad|disc(?:ourse|uss(?:ions?|))|for(?:o|ums?)|help|support|www)\.)'

# Matches a line that starts with "* " (optionally followed by "[") and a URL
# whose scheme begins with "ht". Group 1 is everything after "://" up to the
# next whitespace, minus one optional subdomain prefix from subdomainPattern.
# The prefix is matched case-insensitively via a scoped (?i:...) because host
# names are case-insensitive and the captured text is lowercased by the caller
# anyway; the "* [ht...://" part stays case-sensitive.
extractionPattern = re.compile(r'(?:^\* \[?ht)[^:/]+://(?i:' + subdomainPattern + r')?(\S+)')
+
def get_cleaned_url(line):
    """Return the sort key for a wiki list line, or None if it has no URL.

    The line is searched with extractionPattern, which only matches lines
    that begin with "* " (optionally followed by "[") and a URL. On a match,
    the text captured by its first group -- the part of the URL after "://"
    and after any subdomain prefix the pattern skips, up to the next
    whitespace -- is returned in lowercase. Note that this includes any path
    following the host name, not just the domain.
    """
    found = extractionPattern.search(line)
    return found.group(1).lower() if found else None
+
+
def _sort_url_blocks(text):
    """Return text with every run of consecutive URL lines sorted by key.

    Each line is classified with get_cleaned_url(). A maximal run of adjacent
    lines that yield a key is sorted by that key; every other line keeps its
    position and thereby separates the runs from each other.
    """
    import itertools  # Local import: only this helper needs it

    entries = [(line, get_cleaned_url(line)) for line in text.split('\n')]
    output = []
    for hasUrl, run in itertools.groupby(entries, key=lambda entry: entry[1] is not None):
        if hasUrl:
            # sorted() is stable, so lines with equal keys keep their relative order
            run = sorted(run, key=lambda entry: entry[1])
        output.extend(line for line, _ in run)
    return '\n'.join(output)


def handle_page(site, page):
    """Sort the URL blocks of a wiki page and save it if anything changed.

    site: object whose login() is called right before saving.
    page: object with a readable and writable text attribute and a save()
        method taking the edit summary.

    Nothing is written (and no login happens) when the page is already in
    sorted order.
    """
    outputStr = _sort_url_blocks(page.text)

    # Update if necessary
    if page.text != outputStr:
        site.login()
        page.text = outputStr
        page.save("Reordered websites")
+
def main():
    """Run handle_page() on every page whose title starts with 'Discourse'.

    Pages are enumerated via site.allpages(prefix='Discourse') on the
    pywikibot site created with code 'en' and family 'ArchiveTeam'.
    """
    site = pywikibot.Site('en', 'ArchiveTeam')
    for page in site.allpages(prefix='Discourse'):
        handle_page(site, page)


# Only run the bot when executed as a script, so that importing this module
# (e.g. to reuse get_cleaned_url) does not log in and edit wiki pages.
if __name__ == '__main__':
    main()