// User:Equinox/Antiblue
//
// Definition from Wiktionary, the free dictionary
// Jump to: navigation, search
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using DotNetWikiBot;


namespace Antiblue
{
    /// <summary>
    /// Removes the unadorned "blue links" from a Wiktionary text.
    /// </summary>
    /// <remarks>
    /// Reads the file named by <c>FILE_INPUT</c>, checks every plain <c>[[word]]</c> link
    /// (links containing a <c>|</c> are not matched) against en.wiktionary.org, deletes the links
    /// whose page text contains a <c>==LANGUAGE_NAME==</c> heading, and writes the result back
    /// over the same file.
    /// </remarks>
    class Program
    {
        // Path of the text file that is read, filtered and then overwritten in place.
        private const string FILE_INPUT = @"C:\Users\home\Desktop\input.txt";

        // Account name passed to the Site constructor; the password is asked for at startup.
        private const string USER = "Equinox";

        // Heading text looked for (wrapped in "==") to decide that an entry already exists.
        private const string LANGUAGE_NAME = "English";

        private const int SLEEPY_TIME = 1000; // milliseconds between requests; avoid blasting the server too hard

        // When true, runs of consecutive spaces in the output are collapsed to a single space.
        private const bool COALESCE_SPACES = true;


        private static readonly Encoding _pageEncoding = Encoding.GetEncoding(1252); // depending on your input file

        private static Site _site = null;

        // Cache of lookups already made: word -> "an entry exists", so each word is requested once.
        private static readonly Dictionary<string, bool> _checkedWords = new Dictionary<string, bool>();

        private static readonly List<string> _letTheseLinksLive = new List<string>(); // any words not to be removed


        /// <summary>
        /// Entry point: prompts for the password of <c>USER</c>, clears the console so the typed
        /// password does not stay on screen, and runs the bot.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        private static void Main(string[] args)
        {
            Console.WriteLine("Enter password for " + USER + ": ");
            string password = Console.ReadLine();
            Console.Clear();

            StartBot(USER, password);
        }


        /// <summary>
        /// Logs in, filters the input file through <see cref="BlueLinkEvaluator"/>, and saves it.
        /// </summary>
        /// <param name="user">Account name handed to the <c>Site</c> constructor.</param>
        /// <param name="password">Password handed to the <c>Site</c> constructor.</param>
        private static void StartBot(string user, string password)
        {
            // HTTPS, so that the credentials are not sent to the server in clear text.
            _site = new Site("https://en.wiktionary.org", user, password);

            string source = File.ReadAllText(FILE_INPUT, _pageEncoding);

            // Look for any string inside [[...]], except where the | splitter is involved.

            source = Regex.Replace(source, @"\[\[[^\]\|]*\]\]", BlueLinkEvaluator);

            if (COALESCE_SPACES)
            {
                source = Regex.Replace(source, " +", " ");
            }

            // The file is only written after every lookup has finished, so a failure part-way
            // through the replacement above leaves the input file untouched.
            File.WriteAllText(FILE_INPUT, source, _pageEncoding);

            Console.WriteLine();
            Console.WriteLine("Press Enter to exit.");
            Console.ReadLine();
        }


        /// <summary>
        /// Regex match evaluator: decides whether a matched <c>[[word]]</c> link is kept or removed.
        /// </summary>
        /// <param name="m">A match of the <c>[[...]]</c> pattern, brackets included.</param>
        /// <returns>
        /// An empty string when the word was found to have an entry (the link is removed);
        /// otherwise the original matched text, unchanged.
        /// </returns>
        private static string BlueLinkEvaluator(Match m)
        {
            string wordWithoutBrackets = m.Value.Substring("[[".Length, m.Value.Length - "[[]]".Length);

            // The pattern also matches "[[]]" (and "[[ ]]"), which names no page at all;
            // leave such text alone instead of asking the wiki about an empty title.
            if (wordWithoutBrackets.Trim().Length == 0)
            {
                return m.Value;
            }

            if (_letTheseLinksLive.Contains(wordWithoutBrackets))
            {
                return m.Value;
            }

            bool entryExists;
            if (!_checkedWords.TryGetValue(wordWithoutBrackets, out entryExists))
            {
                // We haven't seen this word yet, so we need to check its existence on Wiktionary.

                Console.WriteLine(wordWithoutBrackets + "...");

                entryExists = EntryExists(wordWithoutBrackets);
                _checkedWords[wordWithoutBrackets] = entryExists;

                Thread.Sleep(SLEEPY_TIME);
            }

            return entryExists ? String.Empty : m.Value;
        }


        /// <summary>
        /// Loads the page with the given title and reports whether it has an entry.
        /// </summary>
        /// <param name="title">Page title to look up on the site.</param>
        /// <returns>
        /// True when the loaded page text contains <c>==LANGUAGE_NAME==</c>; false otherwise,
        /// including when no text was loaded at all.
        /// </returns>
        private static bool EntryExists(string title)
        {
            Page p = new Page(_site, title);
            p.Load();

            // Assume that "==LANGUAGE_NAME==" in the page source means we have found an existing entry.
            // A null text means nothing was loaded, which is no evidence of an entry: report "not
            // found" so the link is kept rather than silently deleted from the user's file.
            return p.text != null && p.text.Contains("==" + LANGUAGE_NAME + "==");
        }
    }
}