# User:Dan Polansky/CFI.py
""" This is dated code for a dated CFI.
Issues:
* Missing constructed languages
* Missing company names (I don't care)
* Etc.
"""
def shouldBeIncluded(term, sense):
    """Determine whether a term should be included in English Wiktionary
    in a given sense. The sense knows the language and part of speech.

    The rules applied, in order:
      * the term must be attested in this sense;
      * if it comes from a fictional universe, it must meet the
        fictional-universe criteria;
      * if it is a proper name, the proper-name rule decides;
      * otherwise it must not be a semantic sum of its parts.

    Returns a truthy value when the term should be included.
    """
    if not isAttested(term, sense):
        return False
    # "fictional => meets criteria", written as a guard clause.
    if (fromFictionalUniverse(term, sense)
            and not meetsFictionalUniverseCriteria(term, sense)):
        return False
    # Proper names are exempt from the sum-of-parts test and are judged
    # only by their own rule; everything else must not be a sum of parts.
    if isProperName(term, sense):
        return shouldProperNameBeIncludedGivenAttestation(term, sense)
    return not isSemanticSumOfParts(term, sense)
def isAttested(term, sense):
    """Tell whether the term is attested in the given sense.

    A term counts as attested when any one of these holds:
      * it is in widespread use;
      * it has three suitable quotations;
      * its language is extinct and it has one contemporaneous quotation.

    The former "one well-known work" criterion was deleted via vote and
    is intentionally not checked.
    """
    # The whole expression is parenthesized so it can span several lines.
    return (
        isInWidespreadUse(term, sense)
        or hasThreeSuitableQuotations(term, sense)
        # NOTE(review): "isExtict" looks like a typo for "isExtinct"; the
        # language type is not visible here, so confirm before renaming.
        or (sense.getLanguage().isExtict()
            and hasOneContemporaneousQuotation(term, sense))
    )
def hasThreeSuitableQuotations(term, sense):
    """Tell whether the term has at least three suitable quotations.

    A quotation is suitable when it comes from a durably archived source,
    uses the term rather than mentioning it, and is in the language of the
    sense. Quotations that depend on an earlier kept quotation are
    dropped. The remaining quotations must number at least three and span
    at least one year.
    """
    # The current implementation is extremely resource-wasting:
    # Corpus.getAllQuotations is a call of a super-mighty method.
    language = sense.getLanguage()

    # Filter all quotations by the requirements of being durably archived
    # and used rather than mentioned.
    # isFromDurablyArchivedSource is broken: a string alone does not tell
    # its source.
    candidateQuotations = [
        quotation
        for quotation in Corpus.getAllQuotations(term)
        if isFromDurablyArchivedSource(quotation)
        and isUseRatherThanMention(quotation, term, sense)
        and quotation.getLanguage() == language
    ]

    # Remove dependent quotations. Each quotation is checked only against
    # the previously kept ones; building a new list avoids mutating the
    # list that is being iterated.
    independentQuotations = []
    for quotation in candidateQuotations:
        if not any(isDependentQuotation(quotation, earlier)
                   for earlier in independentQuotations):
            independentQuotations.append(quotation)

    # At least three suitable quotations; checked first so the time span
    # is never computed for too few quotations.
    if len(independentQuotations) < 3:
        return False
    # Determine time span: it must be at least one year.
    return getTimeSpanInYears(independentQuotations) >= 1.0
def isSemanticSumOfParts(term, sense):
    """Tell whether the sense of a term is merely the sum of its parts.

    A term with no space and no hyphen is treated as a single word and is
    never a sum of parts. For a multi-word term, the semantics of the
    component senses are combined and compared with the semantics of the
    given sense.

    This is simplified but seems to capture the gist.
    """
    if hasNoSpaceAndNoHyphen(term):
        # Single word: not a sum of parts. Returning True here would make
        # shouldBeIncluded reject every single-word term that is not a
        # proper name. This may be controversial for German.
        return False
    # It is a multi-word term.
    compoundSemantics = Semantics()
    for component in termComponents(term):
        compoundSemantics.expand(component.getSense())
    return sense.getSemantics() == compoundSemantics
def shouldProperNameBeIncludedGivenAttestation(term, sense):
    """Decide whether a proper name should be included.

    The name is judged by its own rule here; nothing in this function
    checks attestation (the function name implies it is already given).

    Given names, surnames and patronymics are always included. Brands of
    products or services are delegated to
    shouldBrandOfProductOrServiceBeIncluded. Every other proper name is
    currently included.
    """
    # Specific cases
    if (isGivenName(term, sense)
            or isSurname(term, sense)
            or isPatronymic(term, sense)):
        return True
    if isBrandOfProductOrService(term, sense):
        return shouldBrandOfProductOrServiceBeIncluded(term, sense)
    # The general case
    # Dummy return value; the general case is unimplemented for lacking
    # consensus.
    return True
def shouldBrandOfProductOrServiceBeIncluded(term, sense):
    """Decide whether a brand of a product or service should be included.

    Placeholder: always returns False and ignores both arguments.
    """
    # Dummy return value; complex criteria that largely lead to exclusion.
    return False
def isDependentQuotation(quotation, quotation2):
    """Tell whether two quotations are dependent on each other.

    Two quotations are dependent when they have the same author, when one
    is a verbatim or near-verbatim quotation of the other, or when both
    are verbatim or near-verbatim quotations of a single original source.
    """
    # The author comparison needs no helper, so it is tried first; with
    # short-circuit "or" the result is the same as in any other order.
    return (
        quotation.getAuthor() == quotation2.getAuthor()
        or verbatimOrNearVerbatimQuotation(quotation, quotation2)
        or verbatimOrNearVerbatimQuotationOfSingleOrigSource(
            quotation, quotation2)
    )
def isUseRatherThanMention(quotation, term, sense):
    """Tell whether a quotation uses the term rather than mentioning it.

    The quotation is a mention when it contains "is called <term>" or
    "<term> is a name", or when the term is in the left-hand part of a
    definition. The sense argument is currently unused.

    Returns False for a detected mention, True otherwise.
    """
    # Local import: no import of "re" is visible elsewhere in this file.
    import re

    # Escape the term so that regex metacharacters in it (e.g. "C++")
    # are matched literally instead of breaking the pattern.
    escapedTerm = re.escape(term)
    # re.search scans the whole quotation, including lines after the
    # first, which a leading ".*" with re.match does not.
    if re.search("is called " + escapedTerm, quotation):
        return False
    if re.search(escapedTerm + " is a name", quotation):
        return False
    if termInLeftHandPartOfDef(term, quotation):
        return False
    # Dummy: the rest of the use-versus-mention checks are unimplemented.
    return True