Examples
Real-world examples of using Crosstem.
Text Preprocessing
Normalize documents for analysis:
from crosstem import DerivationalStemmer
def preprocess_document(text, language='eng'):
    """Return *text* lowercased with every whitespace-separated token stemmed.

    The text is lowercased, split on whitespace, each token is passed through
    a ``DerivationalStemmer`` created for ``language``, and the stems are
    joined back together with single spaces.
    """
    stem = DerivationalStemmer(language).stem
    return ' '.join(stem(token) for token in text.lower().split())
# Example: normalize one sentence using the default language ('eng').
text = "The organization is organizing a conference for organizers"
normalized = preprocess_document(text)
print(normalized)
# Output (printed without the quotes): "the organize is organize a conference for organize"
Information Retrieval
Expand search queries with word families:
from crosstem import DerivationalStemmer


def expand_query(query, language='eng'):
    """Expand a search query into the word family of its root.

    The query is stemmed with a ``DerivationalStemmer`` for ``language``;
    the word family of the resulting root (``get_word_family``) is returned
    as a sorted list.
    """
    stemmer = DerivationalStemmer(language)
    # Stem the query to find root
    root = stemmer.stem(query)
    # Get all words in the family
    family = stemmer.get_word_family(root)
    return sorted(family)
# Example: show the first ten variants (the returned list is sorted).
variants = expand_query('organize')
print(f"Searching for: {', '.join(variants[:10])}...")
# Searching for: disorganization, disorganize, organ,
# organic, organism, organization, organizational, ...
Document Similarity
Compare documents using morphological roots:
from crosstem import DerivationalStemmer
from collections import Counter
def document_similarity(doc1, doc2, language='eng'):
    """Return the Jaccard similarity of the two documents' root sets.

    Each document is split on whitespace; every token is lowercased and
    stemmed. The score is ``|shared roots| / |all distinct roots|``.

    Args:
        doc1: First document text.
        doc2: Second document text.
        language: Language code passed to ``DerivationalStemmer``.

    Returns:
        A float between 0.0 and 1.0; 0.0 when both documents are empty.
    """
    stemmer = DerivationalStemmer(language)
    # Only the distinct roots matter for the score, so build sets directly
    # instead of counting occurrences.
    roots1 = {stemmer.stem(w.lower()) for w in doc1.split()}
    roots2 = {stemmer.stem(w.lower()) for w in doc2.split()}
    all_roots = roots1 | roots2
    if not all_roots:
        # Both documents are empty: avoid dividing by zero and keep the
        # return type a float like the non-empty case.
        return 0.0
    return len(roots1 & roots2) / len(all_roots)
# Example
doc1 = "The organization organized an organizational meeting"
doc2 = "We need to organize and create an organization"
similarity = document_similarity(doc1, doc2)
print(f"Similarity: {similarity:.2%}")
# NOTE(review): this comment originally claimed "~71%". If every function word
# keeps its own root, only 'organize' and 'an' are shared (2 of 9 distinct
# roots, about 22%) - confirm the real figure against the stemmer data.
Topic Modeling Preprocessing
Prepare text for topic modeling:
from crosstem import DerivationalStemmer
class MorphologicalTokenizer:
    """Whitespace tokenizer that reduces each kept token to its stem."""

    def __init__(self, language='eng', min_length=3):
        self.stemmer = DerivationalStemmer(language)
        self.min_length = min_length

    def tokenize(self, text):
        """Return the stems of the tokens in *text*.

        The text is lowercased and split on whitespace. Non-alphanumeric
        characters are removed from each token, and tokens shorter than
        ``min_length`` after that cleanup are dropped.
        """
        cleaned = (
            ''.join(filter(str.isalnum, raw))
            for raw in text.lower().split()
        )
        return [
            self.stemmer.stem(token)
            for token in cleaned
            if len(token) >= self.min_length
        ]
# Example with sklearn
from sklearn.feature_extraction.text import CountVectorizer

tokenizer = MorphologicalTokenizer()
# token_pattern=None: the custom tokenizer replaces the default regex, and
# passing None avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, token_pattern=None)
documents = [
    "The organization organized a meeting",
    "Organizers are organizing the event",
    "She works for an organizational consultancy"
]
X = vectorizer.fit_transform(documents)
print(vectorizer.get_feature_names_out())
# e.g. [..., 'consultancy', 'event', 'meet', 'organize', 'she', 'work', ...]
# NOTE(review): three-letter words such as 'the', 'are' and 'for' also pass the
# default min_length=3 filter, so their stems appear among the features too.
Historical Linguistics
Track word evolution across languages:
# NOTE(review): is_etymology_downloaded is assumed to be exported from the
# package top level alongside download_etymology - confirm against the API.
from crosstem import EtymologyLinker, download_etymology, is_etymology_downloaded

# Download etymology data (one-time)
if not is_etymology_downloaded():
    download_etymology()
def trace_word_origin(word, start_lang='English', max_depth=5):
    """Trace a word back through the languages it was inherited from.

    Starting at ``(start_lang, word)``, the etymology of the current word is
    looked up and the first ``'INHERITED_FROM'`` entry is followed, at most
    ``max_depth`` times.

    Args:
        word: Word to start from.
        start_lang: Language name of the starting word.
        max_depth: Maximum number of inheritance steps to follow.

    Returns:
        A list of ``(language, word)`` tuples beginning with the starting
        word; it has a single element when no inheritance data is found.
    """
    linker = EtymologyLinker()
    chain = [(start_lang, word)]
    current_lang = start_lang
    current_word = word
    for _ in range(max_depth):
        etymology = linker.get_etymology(current_lang, current_word)
        if not etymology or 'INHERITED_FROM' not in etymology:
            break
        ancestors = etymology['INHERITED_FROM']
        if not ancestors:
            # Key present but empty: nothing further to follow.
            break
        # Follow inheritance chain (first listed ancestor only)
        inherited = ancestors[0]
        current_lang = inherited['language']
        current_word = inherited['word']
        chain.append((current_lang, current_word))
    return chain
# Example: print one "language: word" line per step of the chain; the first
# entry is the queried word itself.
origin_chain = trace_word_origin('organize')
for lang, word in origin_chain:
    print(f"{lang}: {word}")
# English: organize
# Middle English: organisen
# Old French: organiser
# Late Latin: organizare
# ...
Corpus Analysis
Analyze word relationships in a corpus:
from crosstem import DerivationalStemmer
from collections import defaultdict
def analyze_word_families(corpus, language='eng', top_n=10):
    """Return the ``top_n`` largest word families found in *corpus*.

    Every word is lowercased and grouped under its stem. The result is a
    list of ``(root, set_of_lowercased_words)`` pairs ordered by family size,
    largest first.
    """
    stemmer = DerivationalStemmer(language)
    families = defaultdict(set)
    for word in corpus:
        lowered = word.lower()
        families[stemmer.stem(lowered)].add(lowered)
    ranked = sorted(
        families.items(),
        key=lambda item: len(item[1]),
        reverse=True,
    )
    return ranked[:top_n]
# Example
corpus = [
    'organize', 'organization', 'organizational', 'organizer',
    'organizing', 'reorganize', 'disorganize', 'beauty',
    'beautiful', 'beautifully', 'beautify', 'run', 'running',
    'runner', 'ran', 'rerun'
]
top_families = analyze_word_families(corpus, top_n=3)
for root, members in top_families:
    print(f"{root}: {len(members)} variants - {sorted(members)}")
# Expected output, assuming the stemmer groups the words into these three
# families (largest family first, because the result is sorted by size):
# organize: 7 variants - ['disorganize', 'organization', ...]
# run: 5 variants - ['ran', 'rerun', 'run', 'runner', 'running']
# beauty: 4 variants - ['beautiful', 'beautifully', ...]
Multilingual Processing
Process documents in multiple languages:
from crosstem import DerivationalStemmer


class MultilingualStemmer:
    """Hold one ``DerivationalStemmer`` per language and dispatch to it."""

    def __init__(self, languages):
        """Create a stemmer for every language code in *languages*."""
        self.stemmers = {
            lang: DerivationalStemmer(lang)
            for lang in languages
        }

    def stem(self, word, language):
        """Stem word in specified language.

        Raises:
            ValueError: If no stemmer was created for ``language``.
        """
        if language not in self.stemmers:
            raise ValueError(f"Unsupported language: {language}")
        return self.stemmers[language].stem(word)

    def stem_document(self, document, language):
        """Stem every whitespace-separated word of *document*; return a list."""
        words = document.split()
        return [self.stem(word, language) for word in words]
# Example: one wrapper object, three languages; the trailing comments show
# the expected stems.
stemmer = MultilingualStemmer(['eng', 'fra', 'deu'])
# English
print(stemmer.stem('organization', 'eng')) # organize
# French
print(stemmer.stem('organisation', 'fra')) # organiser
# German
print(stemmer.stem('Organisation', 'deu')) # organisieren
Named Entity Recognition
Normalize entity variations:
from crosstem import DerivationalStemmer
def normalize_entities(entities, language='eng'):
    """Group entity mentions that share the same stemmed form.

    Each mention is split on whitespace; its words are lowercased, stemmed
    and re-joined with spaces to form the canonical key. Returns a dict
    mapping every canonical key to the list of original mentions, in the
    order they were seen.
    """
    stem = DerivationalStemmer(language).stem
    groups = {}
    for mention in entities:
        key = ' '.join(stem(token.lower()) for token in mention.split())
        groups.setdefault(key, []).append(mention)
    return groups
# Example: group four mentions by canonical key and print each group.
entities = [
    "United Nations Organization",
    "UN Organization",
    "Organizational Structure",
    "Organizing Committee"
]
grouped = normalize_entities(entities)
for canonical, variants in grouped.items():
    print(f"{canonical}: {variants}")
Question Answering
Improve QA by matching word roots:
from crosstem import DerivationalStemmer
def find_relevant_passages(question, passages, language='eng'):
    """Rank passages by how many word roots they share with the question.

    Tokens are lowercased and stripped of non-alphanumeric characters before
    stemming, so attached punctuation ("structure?" vs. "structure.") does
    not prevent a match.

    Args:
        question: Question text.
        passages: Iterable of passage strings.
        language: Language code passed to ``DerivationalStemmer``.

    Returns:
        The passages as a list ordered by descending root overlap; passages
        with equal overlap keep their input order.
    """
    stemmer = DerivationalStemmer(language)

    def roots(text):
        # Remove punctuation from each token and skip tokens that consisted
        # of punctuation only.
        cleaned = (
            ''.join(ch for ch in token if ch.isalnum())
            for token in text.lower().split()
        )
        return {stemmer.stem(token) for token in cleaned if token}

    q_roots = roots(question)
    # sorted() is stable even with reverse=True, so ties keep input order.
    return sorted(
        passages,
        key=lambda passage: len(q_roots & roots(passage)),
        reverse=True,
    )
# Example: rank three passages against one question and show the best match.
question = "How do organizations organize their structure?"
passages = [
    "Companies use organizational charts to show structure.",
    "The meeting was well organized by the committee.",
    "Trees provide shade in the summer."
]
relevant = find_relevant_passages(question, passages)
print("Most relevant:", relevant[0])
# "Companies use organizational charts to show structure."
Text Classification Features
Generate morphological features:
from crosstem import DerivationalStemmer
from sklearn.feature_extraction.text import CountVectorizer


class MorphologicalVectorizer:
    """Bag-of-words vectorizer that stems every word before counting.

    Wraps a default ``CountVectorizer``; documents are stemmed word by word
    and then handed to it.
    """

    def __init__(self, language='eng'):
        self.stemmer = DerivationalStemmer(language)
        self.vectorizer = CountVectorizer()

    def _stem_documents(self, documents):
        """Return each document with its whitespace-separated words stemmed."""
        stem = self.stemmer.stem
        return [
            ' '.join(stem(w) for w in doc.split())
            for doc in documents
        ]

    def fit_transform(self, documents):
        """Stem *documents*, then fit the vectorizer and return the matrix."""
        return self.vectorizer.fit_transform(self._stem_documents(documents))

    def transform(self, documents):
        """Stem *documents* and vectorize them with the fitted vocabulary."""
        return self.vectorizer.transform(self._stem_documents(documents))
# Example with classification
# NOTE: train_documents, train_labels and test_documents are not defined in
# this snippet - supply your own lists of texts and labels.
from sklearn.naive_bayes import MultinomialNB
vectorizer = MorphologicalVectorizer()
classifier = MultinomialNB()
# Fit the vocabulary on the training texts only, then reuse it for the test set.
X_train = vectorizer.fit_transform(train_documents)
classifier.fit(X_train, train_labels)
X_test = vectorizer.transform(test_documents)
predictions = classifier.predict(X_test)