Szerkesztő:BinBot/checkwiki.py

Tudnivalók
Ez a script a Wikipédia:Ellenőrzőműhely munkájának megkönnyítéséhez készült. Például a 94-es hibakódhoz tartozó címlistát így tudod lementeni egy állományba:
checkwiki.py -id:94 -save:check94.txt. A kapott listát fel is töltheted egy allapra, így bármilyen bottal lehet dolgozni belőle. Lásd még: Wikipédia:Botgazdák üzenőfala/Archív 33#Ellenőrzőműhely
# coding: utf-8
"""
This script will download article lists from Check Wikipedia Project (see
https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Check_Wikipedia and
https://tools.wmflabs.org/checkwiki/cgi-bin/checkwiki.cgi).
The result will be stored as a Pywikibot-readable page list in a file, and may
be uploaded to a wikipage as it is.
However, this is not a Pywikibot script, rather a standalone Python program,
but it may be imported to another script (pass arguments to main()).
In this case you may request a title generator (see -list); the titles it
yields may be turned into Pywikibot Page objects.

Arguments:
  -project: Project name as shown at the above webpage. Defaults to huwiki. :-)
  -id:      Task ID. Must be a number. No default, mandatory.
  -save:    Name of the file where we save the result to.
            No default, mandatory (except when you provide -list, in which case
            file name will be discarded if present).
            Will append if the file exists.
  -offset:  Download offset. Must be a number. Defaults to zero.
  -list     Returns a title generator to the caller script.
  -help     This text.
Example: python checkwiki.py -project:enwiki -id:94 -save:check94.txt
Example for import as a generator:
  gen = checkwiki.main('-project:enwiki', '-id:94', '-list')
  (With wrong parameters main() returns None instead of a generator, so
  iterating over the result raises a TypeError.)

Tested both under Python 3.6.1 and Python 2.7.9.
"""

# (C) (D) (E) (F) (G) (A) (H) (C) hu:user:Bináris. Use as is, have fun!

import sys
import re
import codecs
try:
    import urllib.request as urllib # Python 3
except ImportError:
    import urllib2 as urllib # Python 2
# Major version number of the running interpreter.
version = sys.version_info[0]

# FileNotFoundError is defined only in Python 3. Under Python 2 the lookup
# below raises NameError, and the name is aliased to IOError so that it can
# be used in except clauses under both interpreters.
try:
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

# URL template of the bot view of Check Wikipedia. The placeholders are the
# project name ({0}), the task id ({1}) and the download offset ({2}).
baseURL = (
    'https://tools.wmflabs.org/checkwiki/cgi-bin/checkwiki.cgi?'
    'project={0}&view=bots&id={1}&offset={2}'
)

class Reader(object):
    """Download one title list and save it to a file or yield its titles.

    url is the address of the page to download. file is an open, writable
    text file object used by run(); it may be None when only process() or
    generate() is going to be called.
    """

    def __init__(self, url, file):
        self.url = url
        self.file = file

    def _fetch(self):
        """Return the downloaded titles as a list, or None on failure.

        A failure (the resource cannot be opened, or the page has no <pre>
        element) is reported on the standard output. An empty list means
        that the download succeeded but the page listed no titles.
        """
        try:
            response = urllib.urlopen(self.url)
        except urllib.URLError:  # Also catches HTTPError.
            print('Could not open the resource.')
            return None
        try:
            raw = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        # decode() returns text under both Python 2 and Python 3, so no
        # interpreter-specific branch is needed here.
        html = raw.decode('utf-8')
        found = re.search(r'(?s)<pre>(.*?)</pre>', html)
        if found is None:
            print('Structure of the source has changed, cannot parse.')
            return None
        # Strip each line so that CRLF line ends and whitespace-only lines
        # do not turn into bogus titles.
        stripped = [line.strip() for line in found.group(1).split('\n')]
        return [title for title in stripped if title]

    def process(self):
        """Return the list of downloaded titles; empty list on failure."""
        titles = self._fetch()
        return [] if titles is None else titles

    def run(self):
        """Append the titles to self.file, one '* [[title]]' line each.

        Does nothing if no file was given. If the download succeeded but
        there are no titles, this is reported instead of writing. After a
        failed download nothing more is printed, because _fetch() has
        already reported the problem.
        """
        if not self.file:
            return
        titles = self._fetch()
        if titles is None:
            return
        if titles:
            lines = [u'* [[{0}]]'.format(title) for title in titles]
            self.file.write('\n'.join(lines) + '\n')
        else:
            print('No todos found at this task, all done.')

    def generate(self):
        """Yield the titles one by one; downloads on the first request."""
        for title in self.process():
            yield title
   
def main(*args):
    """Parse the arguments, then save the title list or return a generator.

    args are option strings such as '-id:94'; when none are given, the
    command line (sys.argv[1:]) is used instead. Unknown arguments are
    ignored.

    Returns a title generator if -list was given and the arguments are
    valid. Returns None in every other case: after saving the titles in
    standalone mode, after printing the help, and after reporting a
    malformed or missing argument or an output file that cannot be opened.
    """
    project = 'huwiki'
    offset = '0'
    task_id = None
    save = None
    list_mode = False
    # ASCII digits only, up to the very end of the string: '\d+$' would also
    # accept a trailing newline and (in Python 3) non-ASCII digits, and
    # these would get into the URL.
    number = re.compile(r'[0-9]+\Z')

    if not args:
        args = sys.argv[1:]
    for arg in args:
        if arg.startswith('-project:'):
            project = arg[len('-project:'):]
        elif arg.startswith('-id:'):
            task_id = arg[len('-id:'):]
            if not number.match(task_id):
                print('Malformed id: ' + task_id)
                return None
        elif arg.startswith('-offset:'):
            offset = arg[len('-offset:'):]
            if not number.match(offset):
                print('Malformed offset: ' + offset)
                return None
        elif arg.startswith('-save:'):
            save = arg[len('-save:'):]
        elif arg == '-list':
            list_mode = True
        elif arg == '-help':
            print(__doc__)
            return None

    # Mandatory arguments are checked explicitly; a missing id is reported
    # before a missing file name.
    if task_id is None:
        print('Missing task id')
        print(__doc__)
        return None
    if not list_mode and save is None:
        print('Missing file name')
        print(__doc__)
        return None

    url = baseURL.format(project, task_id, offset)
    if list_mode:
        # Called from another script as a generator; any file name given
        # with -save is discarded.
        return Reader(url, None).generate()

    # Standalone mode: append the result to the output file.
    try:
        output = codecs.open(save, 'a', 'utf-8')
    except (IOError, OSError):
        print('Could not open the output file with name ' + save)
        return None
    try:
        Reader(url, output).run()
    finally:
        output.close()
    return None

# Standalone use: called without arguments, main() reads them from sys.argv.
if __name__ == '__main__':
    main()