Szerkesztő:Tgr/catsort.py

A Wikipédiából, a szabad enciklopédiából
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Add default sort key to pages with non-ASCII names
"""
# (C) Tisza Gergő ( http://hu.wikipedia.org/wiki/User:Tgr ) 2008
# Distributed under the terms of the MIT license

# Version marker string of this script.
# NOTE(review): this looks like a version-control keyword placeholder; the
# usual spelling is '$Id$' with a closing dollar sign -- confirm whether the
# missing '$' is a typo before relying on keyword expansion.
__version__ = '$Id'

import re, operator
import wikipedia, pagegenerators

class CatsortBot(object):
  """Bot that proposes a DEFAULTSORT key for pages with non-ASCII titles.

  The key is derived from the page title: configured prefixes and postfixes
  are removed, a trailing parenthesised part is dropped, every
  non-alphanumeric character is deleted, and the per-language character
  rules in sortStringRules are applied to the lowercased result.

  Saving is currently disabled: addDefaultSort only prints the new page
  text and the diff.
  """

  # Names of the default-sort magic word that are recognised on every wiki.
  magicword_defaultsort_common = [u"DEFAULTSORT", u"DEFAULTSORTKEY", u"DEFAULTCATEGORYSORT"]
  # Additional names of the magic word, keyed by language code.
  magicword_defaultsort_local = {
    'en': [],
    'hu': [u"RENDEZÉS", u"KULCS"],
  }
  sortStringRules = {
    # mapping of characters to strings
    # one-to-one mappings can be merged into a single string pair
    # characters that should be deleted can also be unified into a string and mapped to None
    'en': {},
    'hu': {u'áéíóú': 'aeiou', u'ö': 'o~', u'ő': 'o~', u'ü': 'u~', u'ű': 'u~'},
  }
  # Words removed from the beginning of a title (matched case-insensitively).
  ignoredPrefixes = {
    'en': [],
    'hu': ['a ', 'az '],
  }
  # Strings removed from the end of a title (matched exactly as written).
  ignoredPostfixes = {
    'en': [],
    'hu': [u'/Új'],
  }
  # Edit summaries; %s is replaced by the inserted sort key.
  summary_msg = {
    'en': u'Bot adding default sortkey: %s',
    'hu': u'Robot: alapértelmezett rendezés kulcs hozzáadása: %s',
  }

  # Site-independent patterns, compiled once for all instances.
  # Any character that is not a (Unicode) letter, digit or underscore.
  nonAlNum = re.compile(r"\W", re.UNICODE)
  # Trailing "(...)" part of a title together with the whitespace before it.
  trailingParenthesis = re.compile(r'\s*\(.*?\)$')
  # Titles made up exclusively of ASCII letters, digits, underscores and spaces.
  asciiTitlePattern = re.compile(r'^[A-Za-z0-9_ ]+$')

  def __init__(self):
    """Compile the site-dependent patterns and build the translation table."""
    # NOTE: these options are not read anywhere in this class yet
    # (see the TODO in run).
    self.options = {
      'force': False,
      'test': False,
      'verbose': False,
    }

    site = wikipedia.getSite()
    magicWords = self.magicword_defaultsort_common + wikipedia.translate(site, self.magicword_defaultsort_local)
    # Matches template names of the form "<magic word>:...".
    self.defaultSortPattern = re.compile(u'^(' + u'|'.join([re.escape(word) for word in magicWords]) + u'):')
    self.categoryPattern = self._linkPattern(site.category_namespaces())
    self.interwikiPattern = self._linkPattern(site.validLanguageLinks())

    self.maketrans(wikipedia.translate(site, self.sortStringRules))

  def _linkPattern(self, prefixes):
    """Return a pattern matching "[[<prefix>:" at the start of a line.

    The prefixes are escaped, so they are always matched literally.
    """
    alternatives = u'|'.join([re.escape(prefix) for prefix in prefixes])
    return re.compile(u"^\\[\\[(%s):" % alternatives, re.MULTILINE)

  def maketrans(self, rules):
    """Build self.translationTable from a rule dictionary.

    Each key is a string of one or more characters. A value of None deletes
    every character of the key; a one-character key is replaced by the whole
    value; a longer key is paired with its value character by character.

    Raises ValueError if the value of a multi-character key is shorter than
    the key.
    """
    table = {}
    for chars, replacement in rules.items():
      if replacement is None:
        for char in chars:
          table[ord(char)] = None
      elif len(chars) == 1:
        # + u'' makes sure the replacement is unicode
        table[ord(chars)] = replacement + u''
      else:
        if len(replacement) < len(chars):
          raise ValueError("replacement %r is shorter than the characters %r it should replace" % (replacement, chars))
        for char, target in zip(chars, replacement):
          table[ord(char)] = target + u''
    self.translationTable = table

  def translate(self, str):
    """Apply the translation table built by maketrans to a unicode string."""
    return str.translate(self.translationTable)

  def sortString(self, pageName):
    """Return the sort key for pageName using the rules of the current site."""
    site = wikipedia.getSite()
    prefixes = wikipedia.translate(site, self.ignoredPrefixes)
    postfixes = wikipedia.translate(site, self.ignoredPostfixes)
    return self._buildSortString(pageName, prefixes, postfixes)

  def _buildSortString(self, pageName, prefixes, postfixes):
    """Derive a sort key from pageName with the given affix lists.

    Prefixes are matched case-insensitively, because the title is only
    lowercased at the very end and a leading article is normally
    capitalised in a title. Postfixes are matched exactly as written.
    """
    pageName = pageName + u'' # make sure pageName is unicode to get rid of the confusing " expected a character buffer object" error
    for word in prefixes:
      prefixPattern = re.compile(u'^' + re.escape(word), re.IGNORECASE | re.UNICODE)
      pageName = prefixPattern.sub(u'', pageName)
    for word in postfixes:
      pageName = re.sub(re.escape(word) + u'$', u'', pageName)
    pageName = self.trailingParenthesis.sub(u'', pageName) # different meanings of the same word should stick together
    pageName = self.nonAlNum.sub(u'', pageName)
    return self.translate(pageName.lower()).capitalize()

  def _insertSortKey(self, text, sortkey):
    """Return text with sortkey placed before the first category link.

    If there is no category link, the key goes before the first interwiki
    link, separated from it by an empty line; if there is neither, it is
    appended to the end of the text.
    """
    for pattern, separator in ((self.categoryPattern, u"\n"), (self.interwikiPattern, u"\n\n")):
      if pattern.search(text):
        # A function replacement keeps the key from being interpreted as a
        # regular expression replacement template.
        return pattern.sub(lambda match: sortkey + separator + match.group(0), text, 1)
    return text + u"\n\n" + sortkey

  def addDefaultSort(self, page, sort_redirect = False):
    """Compute the sort key for page and show the resulting change.

    sort_redirect is accepted but currently unused. Nothing is saved: the
    new text and the diff are only printed.
    """
    oldtext = page.get(get_redirect = True)
    sortkey = u"{{DEFAULTSORT:%s}}" % self.sortString(page.title())
    text = self._insertSortKey(oldtext, sortkey)
    # TODO: saving is still disabled. Note for whoever enables it: the other
    # wikipedia.translate calls in this class pass the site as first argument.
    # page.put(text, wikipedia.translate(wikipedia.getSite(), self.summary_msg) % sortkey)
    wikipedia.output("+++++++++++++++++++++++++++++++++++\n+++++++++++++++++++++++++++++++++++\n"+text) #DEBUG
    wikipedia.showDiff(oldtext, text) #DEBUG
    return #DEBUG

  def _isPlainAsciiTitle(self, title):
    """Return True if title contains only ASCII letters, digits, '_' and spaces."""
    return bool(self.asciiTitlePattern.search(title))

  def _hasDefaultSort(self, templates):
    """Return True if any of the template names is a default-sort magic word."""
    for keyword in templates:
      if self.defaultSortPattern.match(keyword):
        return True
    return False

  def _shouldSkip(self, page):
    """Return True if page does not need a sort key added."""
    if not page.exists():
      return True
    if self._isPlainAsciiTitle(page.title()):
      return True
    if page.isRedirectPage() and not page.categories(get_redirect=True):
      return True
    if self._hasDefaultSort(page.templates()):
      return True
    if page.namespace() == 10: # template
      return True
    return False

  def run(self):
    """Process every page of the generator selected on the command line.

    Shows the page generator help if no argument yields a generator.
    """
    gen = None
    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
      # TODO: force, test, verbose
      generator = genFactory.handleArg(arg)
      if generator:
        gen = generator
    if not gen:
      wikipedia.showHelp('pagegenerators')
      return
    for page in pagegenerators.PreloadingGenerator(gen):
      try:
        wikipedia.output(page.title()) # DEBUG
        wikipedia.output(str(page.templates())) #DEBUG
        if self._shouldSkip(page):
          continue
        self.addDefaultSort(page, sort_redirect=True)
      except wikipedia.NoPage:
        continue


if __name__ == "__main__":
  # Script entry point: build the bot and run it. wikipedia.stopme() is
  # called in every case, whether the run completes or raises.
  try:
    CatsortBot().run()
  finally:
    wikipedia.stopme()