// User:Equinox/code/Antiblue
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using DotNetWikiBot;
namespace Antiblue
{
/// <summary>
/// Removes the unadorned "blue links" from a Wiktionary text.
/// </summary>
/// <remarks>
/// This is pretty slow because it checks each (unique) word in real time, rather than using a downloaded dump.
/// However, things change fast, and using an outdated dump is always liable to miss things.
/// The input file is rewritten in place: each plain [[link]] whose page is found to exist is deleted
/// from the text, and every other link is left exactly as it was.
/// </remarks>
class Program
{
    private const string FILE_INPUT = @"C:\Users\home\Desktop\input.txt";
    private const string USER = "Equinox";
    private const string LANGUAGE_NAME = "English"; // null to remove existing links in any language
    private const int SLEEPY_TIME = 1000; // milliseconds between requests; avoid blasting the server too hard
    private const bool COALESCE_SPACES = true; // collapse runs of spaces left behind by removed links

    private static readonly Encoding _pageEncoding = Encoding.GetEncoding(1252); // depending on your input file
    private static Site _site = null;

    // Cache of lookups already made: word -> true when the entry was found (i.e. the link should go).
    private static readonly Dictionary<string, bool> _checkedWords = new Dictionary<string, bool>();
    private static readonly List<string> _letTheseLinksLive = new List<string>(); // any words not to be removed

    /// <summary>
    /// Entry point: shows which language will be checked, asks for the account password and starts the run.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    private static void Main(string[] args)
    {
        ServicePointManager.SecurityProtocol = (SecurityProtocolType) 3072; // TLS 1.2
        Console.WriteLine("CONFIRM BEFORE USE: Language to check: " + (LANGUAGE_NAME ?? "(any language)"));
        Console.WriteLine("Enter password for " + USER + ": ");
        string password = Console.ReadLine();
        Console.Clear(); // the password was echoed while typing; wipe it from the screen
        StartBot(USER, password);
    }

    /// <summary>
    /// Logs in, filters the links in <see cref="FILE_INPUT"/> and writes the result back to the same file.
    /// </summary>
    /// <param name="user">Account name used to log in.</param>
    /// <param name="password">Password for <paramref name="user"/>.</param>
    private static void StartBot(string user, string password)
    {
        _site = new Site("https://en.wiktionary.org", user, password);
        string source = File.ReadAllText(FILE_INPUT, _pageEncoding);

        // Look for any string inside [[...]], except where the | splitter is involved.
        source = Regex.Replace(source, @"\[\[[^\]\|]*\]\]", BlueLinkEvaluator);
        if (COALESCE_SPACES)
        {
            source = Regex.Replace(source, " +", " ");
        }

        // The file is only written once every link has been processed, so a failure part-way
        // through leaves the original input untouched.
        File.WriteAllText(FILE_INPUT, source, _pageEncoding);
        Console.WriteLine();
        Console.WriteLine("Press Enter to exit.");
        Console.ReadLine();
    }

    /// <summary>
    /// Decides the replacement text for one matched [[link]].
    /// </summary>
    /// <param name="m">A match of the link pattern, including the surrounding brackets.</param>
    /// <returns>
    /// An empty string when the linked entry was found (the link is removed); otherwise the
    /// original link text, unchanged.
    /// </returns>
    private static string BlueLinkEvaluator(Match m)
    {
        string wordWithoutBrackets = m.Value.Substring("[[".Length, m.Value.Length - "[[]]".Length);

        // The pattern also matches an empty "[[]]". There is no title to look up in that case,
        // so leave such links alone instead of requesting a page with a blank name.
        if (wordWithoutBrackets.Trim().Length == 0)
        {
            return m.Value;
        }

        if (_letTheseLinksLive.Contains(wordWithoutBrackets))
        {
            return m.Value;
        }

        bool entryExists;
        if (!_checkedWords.TryGetValue(wordWithoutBrackets, out entryExists))
        {
            // We haven't seen this word yet, so we need to check its existence on Wiktionary.
            Console.WriteLine(wordWithoutBrackets + "...");
            entryExists = EntryExists(wordWithoutBrackets);
            _checkedWords[wordWithoutBrackets] = entryExists;
            Thread.Sleep(SLEEPY_TIME);
        }

        return entryExists ? String.Empty : m.Value;
    }

    /// <summary>
    /// Loads the page called <paramref name="title"/> and reports whether it counts as an existing entry.
    /// </summary>
    /// <param name="title">Page title to load.</param>
    /// <returns>
    /// With no language configured: true when the loaded page text is non-empty. Otherwise: true when
    /// the page text contains "==LANGUAGE_NAME==", which is assumed to mean the entry exists.
    /// </returns>
    private static bool EntryExists(string title)
    {
        Page p = new Page(_site, title);
        p.Load();

        if (LANGUAGE_NAME == null)
        {
            return !String.IsNullOrEmpty(p.text);
        }

        // specific language
        return p.text != null && p.text.Contains("==" + LANGUAGE_NAME + "==");
    }
}
}