# User:SnowyCinema/findwords.py
#
# From Wiktionary, the free dictionary
# Jump to navigation Jump to search
import re

# Reads INPUT_PATH, extracts every token that could plausibly be a dictionary
# entry (plus a few derived spellings), and writes them to OUTPUT_PATH as
# {{l|en|...}} link templates, one per paragraph, sorted case-insensitively.

INPUT_PATH = "file_to_process.txt"
OUTPUT_PATH = "generated_list.txt"

# Symbols that should never appear in entries; each one is turned into a
# space so that it acts as a token separator.
_SEPARATOR_CHARS = '[](){}/\\!?…":;><|*•.—–,=”“@_#©$¢™®♂♀⚥+~«»‹›^'
# Symbols normalised to a plain ASCII apostrophe.  NOTE(review): the white
# circle is mapped to an apostrophe exactly as the original script did; it is
# unclear whether that was intentional -- confirm with the author.
_APOSTROPHE_CHARS = "’‘⚪"
# One translation table lets a single C-level pass do all the replacements.
_CLEANUP_TABLE = str.maketrans(
    _SEPARATOR_CHARS + _APOSTROPHE_CHARS,
    " " * len(_SEPARATOR_CHARS) + "'" * len(_APOSTROPHE_CHARS),
)


def represents_int(text):
    """Return True if ``int()`` accepts *text*, False otherwise.

    Only integers are detected; decimal points are already stripped during
    tokenisation, so a number such as "3.14" arrives here as "3" and "14".
    """
    try:
        int(text)
    except ValueError:
        return False
    return True


def tokenize(line):
    """Return the whitespace-separated tokens of *line* after clean-up.

    Separator symbols and double hyphens become spaces, and curly quotes
    become straight apostrophes.  Single hyphens and apostrophes are kept
    because they can legitimately occur inside entries.
    """
    cleaned = line.translate(_CLEANUP_TABLE).replace("--", " ")
    return cleaned.split()


def candidate_forms(token):
    """Yield *token* followed by the derived spellings worth listing.

    For hyphenated tokens: the joined form, each hyphen-separated part, and
    the space-separated form.  For possessives: the token without "'s".
    For tokens wrapped in apostrophes: the token without them.
    """
    yield token
    if "-" in token:
        parts = token.split("-")
        yield "".join(parts)
        yield from parts
        # Empty parts come from leading/trailing hyphens; dropping them keeps
        # the spaced form free of stray leading/trailing whitespace.
        yield " ".join(part for part in parts if part)
    if token.endswith("'s"):
        yield token[:-2]
    if token.startswith("'") or token.endswith("'"):
        yield token.strip("'")


def add_entry(word, entries, seen):
    """Append *word* and its case variants to *entries* if they are new.

    A word is rejected when it is shorter than two characters, parses as an
    integer, ends with a hyphen, or has been added before.  For a word
    starting with an uppercase letter the lowercase form is added as well;
    for an all-uppercase word the capitalised form is added too.

    *seen* mirrors *entries* as a set so that membership tests are O(1) and
    so that the case variants are de-duplicated like the word itself.
    """
    if len(word) <= 1 or word in seen:
        return
    if word.endswith("-") or represents_int(word):
        return

    variants = [word]
    if word[0].isupper():
        variants.append(word.lower())
        if word.isupper():
            variants.append(word.lower().capitalize())

    for variant in variants:
        if variant not in seen:
            seen.add(variant)
            entries.append(variant)


def collect_entries(lines):
    """Return the sorted, duplicate-free list of candidate entries.

    *lines* is any iterable of strings (for example an open text file).
    The result is sorted case-insensitively; entries that differ only in
    case keep the order in which they were first encountered.
    """
    entries = []
    seen = set()
    for line in lines:
        for token in tokenize(line):
            for form in candidate_forms(token):
                add_entry(form, entries, seen)
    entries.sort(key=str.casefold)
    return entries


def format_entry(word):
    """Return the wiki link template for *word*, followed by a blank line."""
    return "{{l|en|" + word + "}}\n\n"


def main():
    """Read INPUT_PATH and write the generated link list to OUTPUT_PATH."""
    # The script matches and emits non-ASCII symbols, so pin the encoding
    # instead of depending on the platform default.
    with open(INPUT_PATH, "r", encoding="utf-8") as source:
        entries = collect_entries(source)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as target:
        target.writelines(format_entry(word) for word in entries)


if __name__ == "__main__":
    main()