// Source page: User:Equinox/code/SortEnglishWordsByLength
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace SortEnglishWordsByLength
{
/// <summary>
/// Given a tab-separated multi-language titles-only dump of the kind found here:
/// https://tools-static.wmflabs.org/templatehoard/entry_index/
/// creates a series of files 1.txt, 2.txt, 3.txt etc. each listing all the words of 1, 2, 3... characters long.
/// Only entries tagged "en" whose title passes <see cref="IsWordlike"/> are kept.
/// </summary>
class Program
{
    private const string INPUT_FILE = "c:/users/home/desktop/20221001.txt";
    private const string OUTPUT_FOLDER = "c:/users/home/desktop/output/";

    /// <summary>
    /// Reads <c>INPUT_FILE</c>, groups the English word-like titles by length and writes
    /// one "&lt;length&gt;.txt" file per length into <c>OUTPUT_FOLDER</c>.
    /// </summary>
    static void Main()
    {
        // File.ReadLines streams the dump line by line instead of loading the whole file into memory.
        Dictionary<int, List<string>> wordsForLengths = GroupEnglishWordsByLength(File.ReadLines(INPUT_FILE));

        Directory.CreateDirectory(OUTPUT_FOLDER);
        foreach (KeyValuePair<int, List<string>> entry in wordsForLengths)
        {
            File.WriteAllLines(Path.Combine(OUTPUT_FOLDER, entry.Key + ".txt"), entry.Value);
        }
    }

    /// <summary>
    /// Groups the word-like English titles found in <paramref name="lines"/> by their length.
    /// </summary>
    /// <param name="lines">
    /// Lines split on tab characters: the first field is taken as the title, and the line is
    /// kept only if one of the remaining fields is exactly "en".
    /// </param>
    /// <returns>
    /// A dictionary mapping a title length to the titles of that length, in input order.
    /// </returns>
    internal static Dictionary<int, List<string>> GroupEnglishWordsByLength(IEnumerable<string> lines)
    {
        var wordsForLengths = new Dictionary<int, List<string>>();
        foreach (string line in lines)
        {
            string[] bits = line.Split('\t');
            if (!bits.Skip(1).Contains("en"))
            {
                continue; // skip if not English
            }

            string word = bits[0];
            if (!IsWordlike(word))
            {
                continue;
            }

            // Single lookup: fetch the bucket for this length, creating it on first use.
            List<string> bucket;
            if (!wordsForLengths.TryGetValue(word.Length, out bucket))
            {
                bucket = new List<string>();
                wordsForLengths[word.Length] = bucket;
            }
            bucket.Add(word);
        }
        return wordsForLengths;
    }

    /// <summary>
    /// Tells whether <paramref name="x"/> consists solely of ASCII letters, ASCII digits,
    /// apostrophes and hyphens.
    /// </summary>
    /// <param name="x">The candidate title.</param>
    /// <returns>
    /// <c>true</c> if <paramref name="x"/> is non-empty and every character is allowed;
    /// <c>false</c> otherwise, including for <c>null</c> and the empty string.
    /// </returns>
    internal static bool IsWordlike(string x)
    {
        // An empty title would otherwise pass vacuously and end up in a bogus "0.txt".
        if (string.IsNullOrEmpty(x))
        {
            return false;
        }

        foreach (char ch in x)
        {
            bool isLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
            bool isDigit = ch >= '0' && ch <= '9';
            bool isPunctuation = ch == '\'' || ch == '-';
            if (!isLetter && !isDigit && !isPunctuation)
            {
                return false;
            }
        }
        return true;
    }
}
}