User:Equinox/code/SortEnglishWordsByLength

From Wiktionary, the free dictionary
Jump to navigation Jump to search
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;


namespace SortEnglishWordsByLength
{
    /// <summary>
    /// Given a tab-separated multi-language titles-only dump of the kind found here:
    /// https://tools-static.wmflabs.org/templatehoard/entry_index/
    /// creates a series of files 1.txt, 2.txt, 3.txt etc., each listing all the English
    /// words that are 1, 2, 3... characters long.
    /// </summary>
    /// <remarks>
    /// The first tab-separated field of each line is treated as the entry title; a line is
    /// kept only when one of the remaining fields is exactly "en".
    /// </remarks>
    class Program
    {
        private const string INPUT_FILE = "c:/users/home/desktop/20221001.txt";
        private const string OUTPUT_FOLDER = "c:/users/home/desktop/output/";


        /// <summary>
        /// Reads <see cref="INPUT_FILE"/>, groups its English word-like titles by length and
        /// writes one "&lt;length&gt;.txt" file per group into <see cref="OUTPUT_FOLDER"/>.
        /// </summary>
        static void Main()
        {
            // File.ReadLines streams the dump line by line instead of loading it all into memory.
            Dictionary<int, List<string>> wordsForLengths = GroupEnglishWordsByLength(File.ReadLines(INPUT_FILE));

            Directory.CreateDirectory(OUTPUT_FOLDER);

            foreach (KeyValuePair<int, List<string>> entry in wordsForLengths)
            {
                // WriteAllLines accepts any IEnumerable<string>, so no intermediate array is needed.
                File.WriteAllLines(Path.Combine(OUTPUT_FOLDER, entry.Key + ".txt"), entry.Value);
            }
        }


        /// <summary>
        /// Buckets the word-like English titles found in <paramref name="lines"/> by their length.
        /// </summary>
        /// <param name="lines">
        /// Dump lines; each is split on tabs, the first field being the title and the rest language codes.
        /// </param>
        /// <returns>
        /// A map from title length (<see cref="string.Length"/>) to the titles of that length,
        /// in the order they were encountered. Empty titles are never included.
        /// </returns>
        internal static Dictionary<int, List<string>> GroupEnglishWordsByLength(IEnumerable<string> lines)
        {
            var wordsForLengths = new Dictionary<int, List<string>>();

            foreach (string line in lines)
            {
                string[] bits = line.Split('\t');
                if (!bits.Skip(1).Contains("en")) continue; // skip if not English

                string word = bits[0];

                // An empty title would pass IsWordlike vacuously and produce a bogus 0.txt.
                if (word.Length == 0 || !IsWordlike(word)) continue;

                // Single lookup: fetch the bucket for this length, creating it on first use.
                List<string> bucket;
                if (!wordsForLengths.TryGetValue(word.Length, out bucket))
                {
                    bucket = new List<string>();
                    wordsForLengths[word.Length] = bucket;
                }

                bucket.Add(word);
            }

            return wordsForLengths;
        }


        /// <summary>
        /// Tells whether <paramref name="x"/> consists solely of ASCII letters, ASCII digits,
        /// apostrophes and hyphens.
        /// </summary>
        /// <param name="x">The candidate title.</param>
        /// <returns>
        /// <c>false</c> as soon as any other character is found; otherwise <c>true</c>
        /// (which includes the empty string, since it has no offending character).
        /// </returns>
        static bool IsWordlike(string x)
        {
            foreach (char ch in x)
            {
                bool isLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
                bool isDigit = ch >= '0' && ch <= '9';
                bool isJoiner = ch == '\'' || ch == '-';

                if (!isLetter && !isDigit && !isJoiner)
                {
                    return false;
                }
            }

            return true;
        }
    }
}