#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Add default sort key to pages with non-ASCII names
"""
# (C) Tisza Gergő ( http://hu.wikipedia.org/wiki/User:Tgr ) 2008
# Distributed under the terms of the MIT license
__version__ = '$Id'
import re, operator
import wikipedia, pagegenerators
class CatsortBot(object):
    """Propose a ``{{DEFAULTSORT:...}}`` key for pages with non-ASCII names.

    For every page produced by the page generator selected on the command
    line, the bot derives a sort key from the page title (see
    ``sortString``) and inserts it in front of the first category link, or
    failing that in front of the first interwiki link, or failing that at
    the end of the text.

    The per-language tables below are looked up through
    ``wikipedia.translate`` for the site returned by ``wikipedia.getSite``.
    """

    # Names under which the default-sort magic word may already be present.
    magicword_defaultsort_common = [u"DEFAULTSORT", u"DEFAULTSORTKEY", u"DEFAULTCATEGORYSORT"]
    magicword_defaultsort_local = {
        'en': [],
        'hu': [u"RENDEZÉS", u"KULCS"],
    }
    sortStringRules = {
        # mapping of characters to strings
        # one-to-one mappings can be merged into a single string pair
        # characters that should be deleted can also be unified into a string and mapped to None
        'en': {},
        'hu': {u'áéíóú': 'aeiou', u'ö': 'o~', u'ő': 'o~', u'ú': 'u', u'ü': 'u~', u'ű': 'u~'},
    }
    # Leading words dropped from the title before the sort key is built.
    ignoredPrefixes = {
        'en': [],
        'hu': ['a ', 'az '],
    }
    # Trailing strings dropped from the title before the sort key is built.
    ignoredPostfixes = {
        'en': [],
        'hu': [u'/Új'],
    }
    summary_msg = {
        'en': u'Bot adding default sortkey: %s',
        'hu': u'Robot: alapértelmezett rendezés kulcs hozzáadása: %s',
    }

    # Anything that is not a (Unicode) word character is removed from keys.
    nonAlNum = re.compile(r"[^\w\d]", re.UNICODE)
    # A title made only of ASCII letters, digits, underscores and spaces.
    # The character class is spelled out so the meaning does not depend on
    # whether ``\w`` is Unicode-aware in the running interpreter.
    plainAsciiTitle = re.compile(r"^[A-Za-z0-9_ ]+$")

    def __init__(self):
        """Compile the site-dependent patterns and the translation table."""
        self.options = {
            'force': False,
            'test': False,
            'verbose': False,
        }
        site = wikipedia.getSite()
        magicwords = (self.magicword_defaultsort_common
                      + wikipedia.translate(site, self.magicword_defaultsort_local))
        # Matches a template name of the form "<magic word>:...".
        self.defaultSortPattern = re.compile(u'^(%s):' % u'|'.join(magicwords))
        # Both patterns match the opening "[[<prefix>:" at the start of a line.
        self.categoryPattern = re.compile(
            u"^\\[\\[(%s):" % ('|'.join(site.category_namespaces())),
            re.MULTILINE)
        self.interwikiPattern = re.compile(
            u"^\\[\\[(%s):" % ('|'.join(site.validLanguageLinks())),
            re.MULTILINE)
        self.maketrans(wikipedia.translate(site, self.sortStringRules))

    def maketrans(self, rules):
        """Build ``self.translationTable`` from a rule dictionary.

        Each key of *rules* is a string of one or more characters:

        * value ``None``: every character of the key is deleted;
        * one-character key: that character maps to the whole value;
        * longer key: the i-th character maps to the i-th character of
          the value.

        Raises ``ValueError`` when a multi-character key has a shorter
        replacement string.
        """
        table = {}
        for chars, replacement in rules.items():
            if replacement is None:
                for char in chars:
                    table[ord(char)] = None
            elif len(chars) == 1:
                # "+ u''" coerces byte strings to unicode on Python 2.
                table[ord(chars)] = replacement + u''
            else:
                if len(replacement) < len(chars):
                    raise ValueError(
                        "replacement %r is shorter than rule %r"
                        % (replacement, chars))
                for char, target in zip(chars, replacement):
                    table[ord(char)] = target + u''
        self.translationTable = table

    def translate(self, str):
        """Return *str* with ``self.translationTable`` applied."""
        return str.translate(self.translationTable)

    def sortString(self, pageName):
        """Return the sort key for *pageName* using the current site's rules."""
        site = wikipedia.getSite()
        return self._buildSortString(
            pageName,
            wikipedia.translate(site, self.ignoredPrefixes),
            wikipedia.translate(site, self.ignoredPostfixes))

    def _buildSortString(self, pageName, prefixes, postfixes):
        """Turn *pageName* into a sort key.

        Steps, in order: drop each matching entry of *prefixes* from the
        start and of *postfixes* from the end, drop a trailing
        parenthesised part, remove every non-word character (spaces
        included), lower-case, apply the translation table, and capitalise
        the first character.
        """
        # make sure the name is unicode to get rid of the confusing
        # "expected a character buffer object" error
        name = pageName + u''
        for prefix in prefixes:
            # Compared case-insensitively: the configured prefixes are
            # lower-case but titles normally start with a capital letter.
            if prefix and name[:len(prefix)].lower() == prefix.lower():
                name = name[len(prefix):]
        for postfix in postfixes:
            if postfix and name.endswith(postfix):
                name = name[:-len(postfix)]
        # different meanings of the same word should stick together
        name = re.sub(r'\s*\(.*?\)$', '', name)
        name = self.nonAlNum.sub('', name)
        return self.translate(name.lower()).capitalize()

    @staticmethod
    def _hasPlainAsciiTitle(title):
        """Return True if *title* has only ASCII letters, digits, '_' and spaces."""
        return bool(CatsortBot.plainAsciiTitle.match(title))

    def addDefaultSort(self, page, sort_redirect = False):
        """Show the text of *page* with a DEFAULTSORT key inserted.

        The key goes on its own line before the first category link; if
        there is none, before the first interwiki link followed by an empty
        line; otherwise it is appended after an empty line.

        *sort_redirect* is accepted but not used. Nothing is saved: the
        result and its diff are only written with ``wikipedia.output`` and
        ``wikipedia.showDiff``.
        """
        oldtext = page.get(get_redirect = True)
        sortkey = "{{DEFAULTSORT:%s}}" % self.sortString(page.title())

        anchor = self.categoryPattern.search(oldtext)
        separator = u"\n"
        if anchor is None:
            anchor = self.interwikiPattern.search(oldtext)
            separator = u"\n\n"

        if anchor is None:
            text = oldtext + "\n\n" + sortkey
        else:
            # Insert by position rather than through a regex replacement
            # template, so the key never needs escaping.
            cut = anchor.start()
            text = oldtext[:cut] + sortkey + separator + oldtext[cut:]

        # TODO: saving is still disabled while the bot is being debugged.
        # Intended call, with the site argument that wikipedia.translate
        # takes everywhere else in this class:
        # page.put(text, wikipedia.translate(wikipedia.getSite(), self.summary_msg) % sortkey)
        wikipedia.output("+++++++++++++++++++++++++++++++++++\n+++++++++++++++++++++++++++++++++++\n"+text) #DEBUG
        wikipedia.showDiff(oldtext, text) #DEBUG
        return #DEBUG

    def run(self):
        """Process every page of the generator chosen on the command line.

        Pages are skipped when they do not exist, have a plain ASCII title,
        are redirects without categories, already use a default-sort magic
        word, or are in namespace 10 (templates). If no generator argument
        was given, the help for 'pagegenerators' is shown instead.
        """
        gen = None
        genFactory = pagegenerators.GeneratorFactory()
        for arg in wikipedia.handleArgs():
            # TODO: force, test, verbose
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
        if not gen:
            wikipedia.showHelp('pagegenerators')
            return

        for page in pagegenerators.PreloadingGenerator(gen):
            try:
                wikipedia.output(page.title()) # DEBUG
                wikipedia.output(str(page.templates())) #DEBUG
                if not page.exists():
                    continue
                if self._hasPlainAsciiTitle(page.title()):
                    continue
                if page.isRedirectPage() and len(page.categories(get_redirect=True)) == 0:
                    continue
                if any(self.defaultSortPattern.match(keyword)
                       for keyword in page.templates()):
                    continue
                if page.namespace() == 10: # template
                    continue
                self.addDefaultSort(page, sort_redirect=True)
            except wikipedia.NoPage:
                continue
if __name__ == "__main__":
    # Script entry point: build the bot and run it. wikipedia.stopme() is
    # called on the way out whether or not construction or the run raised.
    try:
        CatsortBot().run()
    finally:
        wikipedia.stopme()