User:MewBot/formbot.py

Definition from Wiktionary, the free dictionary
Jump to: navigation, search
#!/usr/bin/env python
#coding: utf-8
 
# Copyright CodeCat 2010 - 2013
 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
import wikipedia, re, string, sys
 
 
class GenericFormBot:
	"""A generic class for Wiktionary form bots.

	This class is an abstract base class, and isn't meant to be instantiated
	directly. To use it, derive a new class from it, override the
	generateEntries method with a proper definition, and provide a call to
	the base class constructor.
	Once you're ready to let it run, just call run() and it's all sorted.

	The purpose of this script is to provide automated generation of
	Wiktionary entries for inflected forms. It does this by fetching a
	Wiktionary page, then checking for the existence of certain templates on
	that page. If found, it extracts the necessary information from the
	template parameters, and passes it on to the generateEntries method,
	which generates the forms (just as the templates themselves do). The
	results are then uploaded as new entries.

	It will either create a new page or append a new section to the
	page. It will skip the page if it already contains a section of the same
	type as the one being created, unless the bot was told to force the edit.
	When appending to an existing page without any conflict, it adds
	{{rfc-auto}} to it, so that the AutoFormat bot can automatically place
	the section in the proper place on the page. On a forced append the
	cleanup category (if one was given) is added instead.
	"""

	def __init__(self, head, templates, langCode, langName,
		cleanupCat = None, simulation = False, force = False, verbose = False):
		"""Store the bot's configuration.

		head       -- title of the page whose templates are read by run().
		templates  -- names of the templates to look for on that page.
		langCode   -- language code; stored but not used by this base class.
		langName   -- language name, used for the ==Language== header.
		cleanupCat -- category name linked on pages that were force-appended.
		simulation -- if true, report what would be done without saving.
		force      -- if true, append even when a conflicting section exists.
		verbose    -- if true, print the generated wikitext of every entry.
		"""

		self._head = head
		self._templates = templates
		self._langCode = langCode
		self._langName = langName
		self._cleanupCat = cleanupCat

		self._simulation = simulation
		self._force = force
		self._verbose = verbose


	def run(self):
		"""Fetch a wiktionary entry and create entries from information in all form template occurrences."""

		page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), self._head)

		if not page.exists():
			wikipedia.output(u"Can't find page [[{0}]].".format(self._head))
			return

		# Find all occurrences of form templates
		templates = getTemplates(page.get(), self._templates)

		if not templates:
			wikipedia.output(u"No form template on page [[{0}]].".format(self._head))
			return

		for temp in templates:
			wikipedia.output(u"Found: {0}".format(temp))
			name, params = parseTemplate(temp)
			self.makeEntries(name, params)


	def makeEntries(self, template, params):
		"""Create entries from information in one form template.

		template and params are the name and parameter dictionary of one
		template call, as returned by parseTemplate().
		"""

		entries = self.generateEntries(template, params)

		if not entries:
			return

		# Don't create a form entry on the very page the forms were read from
		entries.pop(self._head, None)

		anySaved = False

		# Every entry must be attempted, so no short-circuiting here
		for title, entry in entries.items():
			if self.saveEntry(title, entry):
				anySaved = True

		if not anySaved:
			wikipedia.output(u"Note: Did not add any new entries from page [[{0}]].".format(self._head))


	def zipEntries(self, entries, header):
		"""Return with each entry zipped together into one string.

		entries maps each form to a list of definition lines; the result maps
		each form to header followed by those lines as a numbered ("# ") list.
		"""

		return {form: header + '# ' + '\n# '.join(lines) for form, lines in entries.items()}


	def generateEntries(self, template, params):
		"""Override this in a derived class.

		It is called with the name and parameters of one template call and
		should return a dictionary mapping page titles to the wikitext of the
		entry to save there (see makeEntries). The wikitext is expected to
		start with a ===Part of speech=== header (see saveEntry).
		This base implementation returns None, which makes makeEntries do
		nothing.
		"""
		return None


	def saveEntry(self, title, entry):
		"""Save a new entry to Wiktionary.

		Creates the page title with entry under a language header, or appends
		it to the existing page. Returns False if the page was skipped, and
		True otherwise (also when only simulating).
		"""

		page = wikipedia.Page(wikipedia.getSite('en', 'wiktionary'), title)
		# This must be a unicode literal: under Python 2 a byte-string format()
		# fails on a language name containing non-ASCII characters.
		newContents = u'=={0}==\n'.format(self._langName) + entry

		if page.exists():
			oldContents = page.get()

			if entry in oldContents:
				wikipedia.output(u"Skipped page [[{0}]]. Already contains the new entry.".format(title))
				return False

			newContents = '\n\n----\n' + newContents
			conflict = self._findConflict(oldContents, entry)

			if conflict is None:
				# Let the AutoFormat bot move the new section to its proper place
				newContents += '\n{{rfc-auto}}'
			elif self._force:
				wikipedia.output(u"WARNING: Forced append to [[{0}]]. {1}".format(title, conflict))

				if self._cleanupCat:
					newContents += '\n[[' + self._cleanupCat + ']]'
			else:
				wikipedia.output(u"Skipped page [[{0}]]. {1}".format(title, conflict))
				return False

			if self._simulation:
				wikipedia.output(u"Simulated update to page [[{0}]].".format(title))
			else:
				page.put(oldContents + newContents, comment = u'Auto-generated {0} verb forms - appended'.format(self._langName), minorEdit = False)
		else:
			if self._simulation:
				wikipedia.output(u"Simulated creating page [[{0}]].".format(title))
			else:
				page.put(newContents, comment = u'Auto-generated {0} verb forms'.format(self._langName), minorEdit = True)

		if self._verbose:
			wikipedia.output(u"Page [[{0}]] new contents:\n".format(title) + '-' * 60, toStdout = True)
			wikipedia.output(newContents, toStdout = True)
			wikipedia.output('*' * 60, toStdout = True)

		return True


	def _findConflict(self, oldContents, entry):
		"""Return a message describing why entry clashes with the existing page text, or None if it doesn't."""

		langSections = getSections(oldContents, self._langName, 2)

		if not langSections:
			return None

		# There is more than one section for this language already.
		# The bot probably was here before!
		if len(langSections) > 1:
			return u"More than one {0} section on page.".format(self._langName)

		# There is exactly one lang section on the page
		start, end = langSections[0]
		langContents = oldContents[start:end]

		# Does the lang section have numbered etymologies?
		if re.search(r'=== *Etymology \d+ *===', langContents, re.UNICODE):
			return u"{0} section has numbered etymology sections.".format(self._langName)

		pos = self._entryPos(entry)

		# Without a part of speech we can't check for duplicates, so play it safe
		if pos is None:
			return u"Cannot determine the part of speech of the new entry."

		posHeaders = [pos, pos + u' form']

		# Special case... this happened to me once, so I might as well code it in
		if pos == 'Verb':
			posHeaders.append(u'Participle')

		# Does the lang section have a section for this part of speech already in it?
		headerRegex = r'=== *(?:' + u'|'.join(re.escape(header) for header in posHeaders) + r') *==='

		if re.search(headerRegex, langContents, re.UNICODE):
			return u"Already has {0} {1} section.".format(self._langName, pos)

		return None


	def _entryPos(self, entry):
		"""Return the name in the ===Part of speech=== header that entry starts with, or None if there is none."""

		match = re.match(r'=== *([^=\n]+?) *===', entry, re.UNICODE)
		return match.group(1) if match else None
 
 
def getTemplates(contents, names):
	"""Get all template calls to a specific set of templates from a page.

	contents is the wikitext to search and names the template names to look
	for. Returns a list with the inner text of every matching {{...}} call
	(braces and surrounding whitespace removed), in order of appearance.

	The names are inserted into a regular expression as they are, and match
	as a prefix of the template name. A call ends at the first "}}", so calls
	containing nested templates are cut short.
	"""

	# An empty alternation would match every template call on the page
	if not names:
		return []

	pattern = r'{{\s*((?:' + u'|'.join(names) + r').*?)\s*}}'

	return [match.group(1) for match in re.finditer(pattern, contents, re.UNICODE | re.DOTALL)]
 
 
def parseTemplate(template):
	"""Parse and convert parameters of a template into dictionaries.

	template is the inner text of a template call, without the braces.
	Returns a tuple of the template name and a dictionary of its parameters.
	Positional parameters and named parameters whose name is a number get an
	integer key, all others a string key. HTML comments are removed, names
	and values are stripped of whitespace, and empty values are left out.

	The text is split on every "|", so values containing piped links or
	nested templates are not handled.
	"""

	template = re.sub(r'<!--.*?-->', '', template, flags = re.UNICODE | re.DOTALL)
	parts = template.split('|')
	# The name may be followed by a newline when the call is spread over several lines
	templateName = parts[0].strip()

	params = {}
	paramIndex = 1

	for part in parts[1:]:
		paramName, equals, paramValue = part.partition('=')

		if equals:
			paramName = paramName.strip()

			# Is the name a number?
			try:
				paramName = int(paramName)
			except ValueError:
				pass
		else:
			# No = in the string, so it's a positional parameter.
			# Empty ones still take up a position.
			paramValue = paramName
			paramName = paramIndex
			paramIndex += 1

		paramValue = paramValue.strip()

		if paramValue:
			params[paramName] = paramValue

	return templateName, params
 
def makeTemplate(name, params):
	"""Expand a template, given its name and parameters.

	name is the template name and params a dictionary of its parameters.
	Returns the wikitext of the template call, with every parameter written
	out as "|key=value" in the iteration order of the dictionary.
	"""

	pieces = [u"{{", name]

	for key, val in params.items():
		# Let format() do the conversion: str() can't handle non-ASCII
		# unicode keys under Python 2, and values need not be strings.
		pieces.append(u"|{0}={1}".format(key, val))

	pieces.append(u"}}")

	return u"".join(pieces)
 
def getSections(contents, name, level, inclHeader = True):
	"""Get the start and end index of all sections of a given name.

	contents is the wikitext to search, name the literal section title and
	level the header level (the number of = signs). Returns a list of
	(start, end) index tuples, one per section found, which is empty if
	there is no such section. The range starts at the header if inclHeader
	is true and at the text following it otherwise; it ends before the next
	header of the same level, or at the end of the text.
	"""

	marker = u'=' * level
	# The name is a title, not a pattern: escape it so that characters like
	# parentheses match literally and don't add capture groups.
	header = marker + r' *' + re.escape(name) + r' *' + marker
	nextHeader = marker + r' *[^\n=]+ *' + marker
	# The next header is only looked ahead at, not consumed. Otherwise a
	# directly following section of the same name would never be found.
	sectionRegex = r'(' + header + r'\s*)(.*?)(?=\n' + nextHeader + r'|$)'

	firstGroup = 1 if inclHeader else 2

	return [(match.start(firstGroup), match.end(2))
		for match in re.finditer(sectionRegex, contents, re.DOTALL | re.UNICODE)]