# User:Surjection/updateflagcssbot.py
#
# From Wiktionary, the free dictionary
# Jump to navigation Jump to search
import re
import hashlib
from pywikibot import Site, Page


# The pages to read from / write to.
# get_page_title is the wiki page holding the human-edited flag list;
# set_page_title is the CSS page that gets overwritten with the generated rules.

get_page_title = "Wiktionary:Language flags list"
set_page_title = "MediaWiki:Gadget-WiktCountryFlags.css"
# NOTE: because this modifies global CSS, interface admin rights are required

# The L2 order. The following languages are always placed first, in this order.
# (Every other language is ordered alphabetically by name.)
always_first_l2 = ["Translingual", "English"]

# The CSS header. It is a str.format template: {get_page_title} and
# {get_page_revision_id} are filled in with the source page and the revision
# of the list that the CSS was generated from.

css_header_template = """/*
 *  flags.css - adds flags to language headers.
 *  originally created by [[User:Prince Kassad]]
 *  this version is auto-generated from the list at [[{get_page_title}]]
 *  ([[Special:Permalink/{get_page_revision_id}]])
 */

"""

# The CSS rules to create for every language.
# selector_template is formatted once per language ({language} is the header
# id); rule_template receives the comma-joined selectors and the quoted URL.
# The doubled braces in rule_template are literal CSS braces escaped for
# str.format.

selector_template = ".ns-0 h2 #{language}::before, .ns-0 h2#{language}::before"
rule_template = """{selectors}{{padding-right:5px;
 content:url({url});}}"""

# Regex to iterate over the language names and filenames
# in the CSS code produced by this script.
# Group 1 captures a language id from an "h2 #id::before" selector (only the
# variant with a space after "h2" matches, so each language is seen once);
# group 2 captures the file name from the thumbnail URL of the rule body.

css_language_filename_regex = (r"""h2 \#([^:]+)::before|\{[^;]+;\s*content:url\(['"]"""
    + re.escape("//upload.wikimedia.org/wikipedia/commons/thumb/")
    + r"[0-9a-f]/[0-9a-f]{2}/([^/]+)")

# Other settings.
# default_size is the thumbnail size used when a list line gives none.
# When debug_mode is true the generated CSS and edit summary are printed
# instead of being saved to the wiki.

default_size = "45px"
edit_summary = "(bot) update CSS based on flag list from [[" + get_page_title + "]]"
debug_mode = False

# Format for the input list.
# list_regex captures everything between the start/end marker comments (it is
# applied with re.DOTALL so the capture can span lines). line_regex matches one
# "language: filename [size]" line: group 1 is the language, group 2 the file
# name and the optional group 3 a size such as "30px".

list_regex = r"<!-- start list -->(.+?)<!-- end list -->"
line_regex = r"(.+?): (.+?)(?: (\d+px))?$"

# Code follows

size_dict = {}  # filename -> size; placeholder, filled in once the list is parsed


def convert_line_to_item(match):
    """Turn the regex match for one list line into a (language, file, size) tuple.

    Groups 1, 2 and 3 of ``match`` are read as language, file name and size.
    Underscores in the file name become spaces, and ``default_size`` is
    substituted when the size group is empty. A falsy ``match`` (the line
    did not match) yields ``None``.
    """
    if match:
        language, filename, size = match.group(1, 2, 3)
        return (language, filename.replace("_", " "), size or default_size)
    return None


def get_url_from_file(file):
    """Build the protocol-relative Commons thumbnail URL for ``file``.

    The size is looked up in ``size_dict`` under the name exactly as given
    (before spaces are turned into underscores), falling back to
    ``default_size``.
    """
    thumb_size = size_dict.get(file, default_size)
    underscored = file.replace(" ", "_")

    # SVG files do not have thumbnails of their own, so ask for a PNG rendering
    if underscored.endswith(".svg"):
        thumb_name = underscored + ".png"
    else:
        thumb_name = underscored

    # right now the thumbnail path contains the first one and the first two
    # hex digits of the MD5 hash of the underscored file name;
    # this may change one day
    digest = hashlib.md5(underscored.encode('utf-8')).hexdigest().lower()

    # the leading "//" makes the URL protocol-relative
    url_template = ("//upload.wikimedia.org/wikipedia/commons/thumb/"
                    "{l1}/{l2}/{file}/{size}-{thumbfile}")
    return url_template.format(
        l1=digest[:1], l2=digest[:2],
        file=underscored, size=thumb_size, thumbfile=thumb_name)


def quote_url_for_css(url):
    """Wrap ``url`` in quotes so it can be placed inside a CSS ``url(...)``.

    Single quotes are used unless the URL itself contains a single quote,
    in which case double quotes are used. Double quotes inside the URL are
    always percent-encoded as ``%22``.
    """
    escaped = url.replace('"', '%22')
    if "'" in url:
        return '"' + escaped + '"'
    return "'" + escaped + "'"


def make_css_rule(l2s, file):
    """Create the CSS rule that shows the flag ``file`` for each language in ``l2s``.

    One selector (from ``selector_template``) is produced per language and
    the selectors are joined with ", " in front of a single rule body.
    """
    selector_parts = []
    for l2 in l2s:
        # spaces become underscores and apostrophes are backslash-escaped
        css_id = l2.replace(" ", "_").replace("'", r"\'")
        selector_parts.append(selector_template.format(language=css_id))

    flag_url = quote_url_for_css(get_url_from_file(file))
    return rule_template.format(selectors=', '.join(selector_parts), url=flag_url)


def deduplicate(array):
    """Return the items of ``array`` as a list with repeated items dropped.

    The original order is kept; of several equal items only the first
    occurrence survives. Items must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in array:
        if item not in already_seen:
            already_seen.add(item)
            unique_items.append(item)
    return unique_items


def reverse_lookup(dict, value):
    """Return a list of every key in ``dict`` whose value equals ``value``.

    Keys appear in the dictionary's iteration order; the list is empty
    when no value matches.
    """
    matching_keys = []
    for key, candidate in dict.items():
        if candidate == value:
            matching_keys.append(key)
    return matching_keys


def search_matching(array, predicate, start=0, stop=2147483647):
    """
    Scan ``array[start:stop]`` and return the index of the first item for
    which ``predicate(index, item)`` is truthy. Reported indices count up
    from ``start``. Returns -1 when no item satisfies the predicate.
    """
    index = start
    for value in array[start:stop]:
        if predicate(index, value):
            return index
        index += 1
    return -1


def convert_flag_dict_to_css(flags):
    """Render a {language: file} dictionary as CSS rules, one rule per file.

    Files are emitted in the order in which they first occur when the
    languages are sorted by name, with the ``always_first_l2`` languages
    moved to the front. Every rule lists all languages sharing that file.
    Rules are separated by a blank line.
    """
    # (language, file) pairs ordered by language name
    ordered = sorted(flags.items(), key=lambda pair: pair[0])

    # Walk the configured languages backwards: each one found is moved to the
    # very front, so after the loop they sit in their configured order.
    for l2 in reversed(always_first_l2):
        position = search_matching(ordered, lambda i, v: v[0] == l2)
        # -1 means the language is absent, 0 means it is already first;
        # in both cases there is nothing to move
        if position <= 0:
            continue
        ordered.insert(0, ordered.pop(position))

    # each file once, in order of first appearance
    file_list = deduplicate(file for _, file in ordered)

    # one rule per file, covering every language mapped to that file
    rules = [make_css_rule(reverse_lookup(flags, file), file)
             for file in file_list]
    return "\n\n".join(rules)


def get_flag_dict_from_css(css_text):
    """Parse CSS text back into a {language: filename} dictionary.

    Matches of ``css_language_filename_regex`` are consumed in order:
    language matches are collected until a filename match arrives, at which
    point every collected language is mapped to that filename.
    """
    flag_dict = {}
    pending_languages = []
    for match in re.finditer(css_language_filename_regex, css_text):
        language_id, thumb_file = match.group(1), match.group(2)
        if language_id:
            # undo the escaping applied to the header id
            pending_languages.append(
                language_id.replace("_", " ").replace(r"\'", "'"))
            continue
        if not thumb_file:
            continue
        # undo the changes made to the file name
        restored = thumb_file.replace("_", " ").replace("%22", '"')
        for name in pending_languages:
            flag_dict[name] = restored
        pending_languages = []
    return flag_dict


def get_flag_dict_diffs(old_flag_dict, new_flag_dict):
    """Compare two {language: filename} dictionaries.

    Returns a tuple ``(added, changed, removed)`` of dictionaries:
    ``added`` holds languages only in the new dict (with the new filename),
    ``changed`` holds languages in both whose filename differs (with the
    OLD filename), and ``removed`` holds languages only in the old dict
    (with the old filename).
    """
    added = {}
    for lang, filename in new_flag_dict.items():
        if lang not in old_flag_dict:
            added[lang] = filename

    changed = {}
    removed = {}
    for lang, filename in old_flag_dict.items():
        if lang not in new_flag_dict:
            removed[lang] = filename
        elif new_flag_dict[lang] != filename:
            changed[lang] = filename

    return (added, changed, removed)


def utf8_len(s):
    """Return the number of bytes ``s`` takes up when encoded as UTF-8."""
    encoded = s.encode('utf-8')
    return len(encoded)


def add_flag_change_summary(edit_summary, css_text, new_flag_dict):
    """Extend ``edit_summary`` with a description of which languages changed.

    The flags currently encoded in ``css_text`` are compared with
    ``new_flag_dict`` and the names of added, changed and removed languages
    are appended, e.g. ``"<summary>: add X, Y; remove Z"``.

    Returns:
        ``edit_summary + ": no changes identified"`` when nothing differs;
        the detailed summary when it is at most 800 bytes in UTF-8;
        otherwise ``edit_summary + ": too many changes to list"``.
    """
    old_flag_dict = get_flag_dict_from_css(css_text)
    # Diff against the dictionary passed in by the caller rather than the
    # module-level ``flag_dict``, so the function honours its own parameter.
    added, changed, removed = get_flag_dict_diffs(old_flag_dict, new_flag_dict)

    # one "verb lang, lang, ..." fragment per non-empty category
    messages = [
        verb + " " + ", ".join(langs)
        for verb, langs in (("add", added),
                            ("change", changed),
                            ("remove", removed))
        if langs
    ]

    if not messages:
        return edit_summary + ": no changes identified"

    new_summary = edit_summary + ": " + "; ".join(messages)
    # fall back to a generic note when the detailed summary gets too long
    if utf8_len(new_summary) <= 800:
        return new_summary
    return edit_summary + ": too many changes to list"


# Log in and fetch the page holding the flag list, remembering which revision
# the generated CSS is based on.
enwikt = Site("en", fam="wiktionary")
enwikt.login()
get_page = Page(enwikt, get_page_title)
get_page_revision_id = get_page.latest_revision_id

# Locate the list between its start/end markers. Fail with a readable message
# if the markers are missing instead of an AttributeError on None.
list_match = re.search(list_regex, get_page.text, re.DOTALL)
if list_match is None:
    raise ValueError(
        "could not find the start/end list markers on [[" + get_page_title + "]]")
# group(1) is the text strictly between the markers, so marker text can never
# leak into an entry that shares a line with a marker
list_of_langs = list_match.group(1)

# Parse every line; lines that do not match the entry format are dropped.
flag_list = [convert_line_to_item(re.match(line_regex, line))
             for line in list_of_langs.splitlines()]
flag_list = [x for x in flag_list if x]
# If a file or language occurs more than once, the last entry wins.
size_dict = {filename: size for language, filename, size in flag_list}
flag_dict = {language: filename for language, filename, size in flag_list}

# Assemble the full stylesheet: header comment followed by the rules.
css_header = css_header_template.format(
    get_page_title=get_page_title,
    get_page_revision_id=get_page_revision_id)
css = css_header + convert_flag_dict_to_css(flag_dict)

# Describe the difference to the currently published CSS in the edit summary.
set_page = Page(enwikt, set_page_title)
edit_summary = add_flag_change_summary(edit_summary, set_page.text, flag_dict)

if debug_mode:
    # dry run: show what would be saved without touching the wiki
    print(css + "\n\n----\n\nsummary:\n" + edit_summary)
else:
    set_page.text = css
    set_page.save(summary=edit_summary, minor=False)