Szerkesztő:BinBot/huwiki/article.py

A Wikipédiából, a szabad enciklopédiából

Explanation / Magyarázat

"""
Main namespace stuff for Hungarian Wikipedia.

Although draft namespace also hosts article-like texts, they differ
from articles in several ways: have no categories, have no
Wikidata-item and are not flagged. As at one time there is a
limited number of drafts, the decision is to exclude them.

Articles will be subclassed by biographies.
"""
#
# (C) Bináris, 2023
#
# Distributed under the terms of the MIT license.

import re
from typing import Union
import pywikibot
from pywikibot.backports import cache, List, Set, Tuple
from pywikibot.textlib import extract_sections, does_text_contain_section
from pywikibot.exceptions import NoPageError

from .flaggedrevs import cosmeticable, flagged_state, Flags
from .category import CatProperties

site = pywikibot.Site()

# List of usual section titles at the end of the article such as sources.
# Useful if you want to avoid searching in external links, categories etc.
# The bot may skip the trailing part from the first match of these:
# from the first matching title the whole tail of the article is dropped.
# May be overridden if this default list does not fit your purpose.
stopsections = [
    'Külső hivatkozások',
    'Források',
    'Jegyzetek',
    'Lásd még',
    'Kapcsolódó szócikkek',
    'További információk',
]

class Article(pywikibot.Page):
    """A page in main namespace.

    May be instantiated with either Page or title.
    """

    def __init__(self, title: Union[pywikibot.Page, str]) -> None:
        """Initializer.

        :param title: a Page object or the title of a main namespace page
        :raises ValueError: if the page is not in the main (article) namespace
        """
        if isinstance(title, pywikibot.Page):
            title = title.title()
        super().__init__(site, title)
        if self.namespace() != 0:
            raise ValueError(f'Page must be in article namespace: {title}.')
        self._page_is_parsed = False  # We want to run parse3() only once.

    @property
    def creator(self) -> str:
        """Creator of the article (user name of the oldest revision)."""
        return self.oldest_revision.user

    # ---------------------------------------------
    # Functions dealing with categories of the page
    # ---------------------------------------------

    def really_categorized(self) -> bool:
        """Determine if the article has at least one real, useful category."""
        for cat in self.categories():
            props = CatProperties(cat)
            if not props.is_not_real_categorization():
                return True
        return False

    @property
    def defsort(self) -> str:
        """Existing DEFAULTSORT key, or '' if the page has none.

        The key may contain "fake digraph" helper templates such as
        {{hamis digráf}} or {{nemGy}}; these are kept in the result.
        """
        regex = re.compile(
            r'\{\{DEFAULTSORT:( *(.+?)'
            r'(\{\{([hH]amis digráf|[nN]em[GLNT]y|[nN]em[CZ]s|[nN]emSz)\}\}.+?)*)'
            r'\}\}')
        try:
            return regex.search(self.text).group(1)
        except AttributeError:
            # search() returned None: no DEFAULTSORT on the page.
            return ''

    # ----------------------------------------
    # Functions dealing with types of articles
    # ----------------------------------------

    def is_person(self) -> bool:
        """The article represents a person (it is a biography).

        To determine if an article is a biography (is about a person)
        is not easy and not exact. Sometimes the author of the article
        does not know when people and events are mixed within one page.
        This method is therefore not safe. It will _try_ to determine
        if the page is a bio by Wikidata, categories and infoboxes.

        Code is partially derived from
        https://gerrit.wikimedia.org/r/c/pywikibot/core/+/888791/
        (C) Author of copied part: Ayush Anand33

        This is not necessarily a fast method, use if you really need.
        """
        if not self.exists():
            return False

        # Is P31 (instance of) Q5 (human) in Wikidata?
        try:
            wd_item = pywikibot.ItemPage.fromPage(self)
            if wd_item.exists() and 'P31' in wd_item.claims:
                p31_claims = wd_item.claims['P31']
                for claim in p31_claims:
                    if claim.getTarget().getID() == 'Q5':
                        return True
                return False  # Has Wikidata-item, but is not a person
        except NoPageError:
            # No Wikidata item; fall back to heuristics below.
            pass

        # Does it have person-related categories ("... személyek")?
        for cat in self.categories():
            if cat.title().endswith(" személyek"):
                return True

        # Does it have a person-related infobox?
        infoboxes = get_biography_infobox_templates()
        for t in self.templates():
            if t in infoboxes:
                return True

        # Does it have a biography template on its talk page?
        biography_template = pywikibot.Page(site, 'Sablon:WPÉletrajz')
        if biography_template in self.toggleTalkPage().templates():
            return True

        return False

    # ----------------------------------------
    # Functions dealing with flagged revisions
    # ----------------------------------------

    @property
    def flag(self) -> Flags:
        """FlaggedRevs state of the article (ellenőrzöttség)."""
        return flagged_state(self)

    def cosmeticable(self) -> bool:
        """Is suitable for cosmetic changes?"""
        return cosmeticable(self, force=True)

    # ----------------------------------------
    # Functions dealing with parts of the text
    # ----------------------------------------

    @property
    def header(self) -> str:
        """Header of the article before the first section."""
        if not self._page_is_parsed:
            self.parse3()
        return self._header

    @property
    def maintext(self) -> str:
        """Main text of the article (sections without header and footer)."""
        if not self._page_is_parsed:
            self.parse3()
        return self._maintext

    @property
    def trunctext(self) -> str:
        """Truncated text of the article above the first stopsection.

        This will exclude the trailing sections which are usually full
        of external links which may be annoying when you search sg.
        Does not contain the header part, only from the first section.
        """
        if not self._page_is_parsed:
            self.parse3()
        return self._trunctext

    @property
    def footer(self) -> str:
        """Footer of the article (categories and interwikis)."""
        if not self._page_is_parsed:
            self.parse3()
        return self._footer

    @property
    def sections(self) -> List[Tuple[str, str]]:
        """Sections list from textlib (list of (header, content))."""
        if not self._page_is_parsed:
            self.parse3()
        return self._sections

    def parse3(self) -> None:
        """Fill (and possibly overwrite) page text parts.

        Sets _header, _sections, _maintext, _footer and _trunctext,
        then marks the page as parsed so properties won't parse again.
        """
        content = extract_sections(self.text, site=site)

        # To get the original wikitext concatenate. This is True:
        # page.text == content.header
        #     + ''.join([s.title + s.content for s in content.sections])
        #     + content.footer
        self._header = content.header
        self._sections = content.sections
        self._maintext = ''.join(
            s.title + s.content for s in content.sections)
        self._footer = content.footer
        # Build the truncated text: collect sections until the first
        # stopsection title, then drop the whole tail of the article.
        self._trunctext = ''
        for sec in content.sections:
            if sec.title.strip('= ') in stopsections:
                break
            self._trunctext += sec.title + sec.content
        self._page_is_parsed = True

    def has_section(self, section: str) -> bool:
        """An easier use of the textlib function.

        True if the article has the given section title.
        See the details in textlib.py.
        """
        return does_text_contain_section(self.text, section)

@cache
def get_biography_infobox_templates() -> Set[pywikibot.Page]:
    """Return the set of templates which represent people.

    Code and text adopted from
    https://github.com/roysmith/dyk-tools/blob/60a0b1d5c6c5f7310a541f30388b898b4a906b10/dyk_tools/wiki/article.py#L1
    (C) Original author: Roy Smith

    This employs heuristics to navigate the infobox template categories.
    The exact rules are not well defined, so don't count on this returning
    exactly the same results every time.
    Because this is cached, a process restart or call to cache_clear()
    will be required to pick up any changes since the first invocation.
    Since these template categories change (very) slowly, that's not a
    problem in practice.
    """
    infobox_cat = pywikibot.Category(site, "Személyek infoboxsablonjai")
    # Only direct members of the category are taken; subcategories
    # are deliberately not walked.
    return set(infobox_cat.articles())