Examples ======== Real-world examples of using Crosstem. Text Preprocessing ------------------ Normalize documents for analysis:: from crosstem import DerivationalStemmer def preprocess_document(text, language='eng'): """Normalize document by stemming to linguistic roots.""" stemmer = DerivationalStemmer(language) words = text.lower().split() stems = [stemmer.stem(word) for word in words] return ' '.join(stems) # Example text = "The organization is organizing a conference for organizers" normalized = preprocess_document(text) print(normalized) # Output: "the organize is organize a conference for organize" Information Retrieval --------------------- Expand search queries with word families:: from crosstem import DerivationalStemmer def expand_query(query, language='eng'): """Expand search query with all derivational forms.""" stemmer = DerivationalStemmer(language) # Stem the query to find root root = stemmer.stem(query) # Get all words in the family family = stemmer.get_word_family(root) return sorted(family) # Example variants = expand_query('organize') print(f"Searching for: {', '.join(variants[:10])}...") # Searching for: disorganization, disorganize, organ, # organic, organism, organization, organizational, ... 
Document Similarity ------------------- Compare documents using morphological roots:: from crosstem import DerivationalStemmer from collections import Counter def document_similarity(doc1, doc2, language='eng'): """Calculate similarity based on shared roots.""" stemmer = DerivationalStemmer(language) # Stem both documents roots1 = [stemmer.stem(w.lower()) for w in doc1.split()] roots2 = [stemmer.stem(w.lower()) for w in doc2.split()] # Count roots counter1 = Counter(roots1) counter2 = Counter(roots2) # Calculate overlap shared = set(counter1.keys()) & set(counter2.keys()) total = len(set(counter1.keys()) | set(counter2.keys())) return len(shared) / total if total > 0 else 0 # Example doc1 = "The organization organized an organizational meeting" doc2 = "We need to organize and create an organization" similarity = document_similarity(doc1, doc2) print(f"Similarity: {similarity:.2%}") # ~22% (2 of 9 distinct roots shared) Topic Modeling Preprocessing ----------------------------- Prepare text for topic modeling:: from crosstem import DerivationalStemmer class MorphologicalTokenizer: def __init__(self, language='eng', min_length=3): self.stemmer = DerivationalStemmer(language) self.min_length = min_length def tokenize(self, text): """Tokenize and stem to roots.""" words = text.lower().split() stems = [] for word in words: # Remove punctuation word = ''.join(c for c in word if c.isalnum()) if len(word) >= self.min_length: stem = self.stemmer.stem(word) stems.append(stem) return stems # Example with sklearn from sklearn.feature_extraction.text import CountVectorizer tokenizer = MorphologicalTokenizer() vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize) documents = [ "The organization organized a meeting", "Organizers are organizing the event", "She works for an organizational consultancy" ] X = vectorizer.fit_transform(documents) print(vectorizer.get_feature_names_out()) # ['are', 'consultancy', 'event', 'for', 'meet', 'organize', 'she', 'the', 'work'] Historical Linguistics ---------------------- Track word 
evolution across languages:: from crosstem import EtymologyLinker, download_etymology, is_etymology_downloaded # Download etymology data (one-time) if not is_etymology_downloaded(): download_etymology() def trace_word_origin(word, start_lang='English', max_depth=5): """Trace etymology back through ancestor languages.""" linker = EtymologyLinker() chain = [(start_lang, word)] current_lang = start_lang current_word = word for _ in range(max_depth): etymology = linker.get_etymology(current_lang, current_word) if not etymology or 'INHERITED_FROM' not in etymology: break # Follow inheritance chain inherited = etymology['INHERITED_FROM'][0] chain.append((inherited['language'], inherited['word'])) current_lang = inherited['language'] current_word = inherited['word'] return chain # Example origin_chain = trace_word_origin('organize') for lang, word in origin_chain: print(f"{lang}: {word}") # English: organize # Middle English: organisen # Old French: organiser # Late Latin: organizare # ... Corpus Analysis --------------- Analyze word relationships in a corpus:: from crosstem import DerivationalStemmer from collections import defaultdict def analyze_word_families(corpus, language='eng', top_n=10): """Find most productive word families in corpus.""" stemmer = DerivationalStemmer(language) # Group words by root families = defaultdict(set) for word in corpus: root = stemmer.stem(word.lower()) families[root].add(word.lower()) # Sort by family size sorted_families = sorted( families.items(), key=lambda x: len(x[1]), reverse=True ) # Return top N return sorted_families[:top_n] # Example corpus = [ 'organize', 'organization', 'organizational', 'organizer', 'organizing', 'reorganize', 'disorganize', 'beauty', 'beautiful', 'beautifully', 'beautify', 'run', 'running', 'runner', 'ran', 'rerun' ] top_families = analyze_word_families(corpus, top_n=3) for root, members in top_families: print(f"{root}: {len(members)} variants - {sorted(members)}") # organize: 7 variants - [disorganize, organization, ...] 
# run: 5 variants - [ran, rerun, run, runner, running] # beauty: 4 variants - [beautiful, beautifully, ...] Multilingual Processing ----------------------- Process documents in multiple languages:: from crosstem import DerivationalStemmer class MultilingualStemmer: def __init__(self, languages): self.stemmers = { lang: DerivationalStemmer(lang) for lang in languages } def stem(self, word, language): """Stem word in specified language.""" if language not in self.stemmers: raise ValueError(f"Unsupported language: {language}") return self.stemmers[language].stem(word) def stem_document(self, document, language): """Stem entire document.""" words = document.split() return [self.stem(word, language) for word in words] # Example stemmer = MultilingualStemmer(['eng', 'fra', 'deu']) # English print(stemmer.stem('organization', 'eng')) # organize # French print(stemmer.stem('organisation', 'fra')) # organiser # German print(stemmer.stem('Organisation', 'deu')) # organisieren Named Entity Recognition ------------------------- Normalize entity variations:: from crosstem import DerivationalStemmer def normalize_entities(entities, language='eng'): """Normalize entity mentions to canonical forms.""" stemmer = DerivationalStemmer(language) normalized = {} for entity in entities: words = entity.split() roots = [stemmer.stem(w.lower()) for w in words] canonical = ' '.join(roots) if canonical not in normalized: normalized[canonical] = [] normalized[canonical].append(entity) return normalized # Example entities = [ "United Nations Organization", "UN Organization", "Organizational Structure", "Organizing Committee" ] grouped = normalize_entities(entities) for canonical, variants in grouped.items(): print(f"{canonical}: {variants}") Question Answering ------------------ Improve QA by matching word roots:: from crosstem import DerivationalStemmer def find_relevant_passages(question, passages, language='eng'): """Find passages relevant to question using root matching.""" stemmer = 
DerivationalStemmer(language) # Stem question q_roots = set(stemmer.stem(w.lower()) for w in question.split()) # Score passages scored = [] for passage in passages: p_roots = set(stemmer.stem(w.lower()) for w in passage.split()) overlap = len(q_roots & p_roots) scored.append((overlap, passage)) # Return sorted by relevance scored.sort(reverse=True, key=lambda x: x[0]) return [passage for _, passage in scored] # Example question = "How do organizations organize their structure?" passages = [ "Companies use organizational charts to show structure.", "The meeting was well organized by the committee.", "Trees provide shade in the summer." ] relevant = find_relevant_passages(question, passages) print("Most relevant:", relevant[0]) # "Companies use organizational charts to show structure." Text Classification Features ----------------------------- Generate morphological features:: from crosstem import DerivationalStemmer from sklearn.feature_extraction.text import CountVectorizer class MorphologicalVectorizer: def __init__(self, language='eng'): self.stemmer = DerivationalStemmer(language) self.vectorizer = CountVectorizer() def fit_transform(self, documents): # Stem all documents stemmed_docs = [ ' '.join(self.stemmer.stem(w) for w in doc.split()) for doc in documents ] return self.vectorizer.fit_transform(stemmed_docs) def transform(self, documents): stemmed_docs = [ ' '.join(self.stemmer.stem(w) for w in doc.split()) for doc in documents ] return self.vectorizer.transform(stemmed_docs) # Example with classification from sklearn.naive_bayes import MultinomialNB vectorizer = MorphologicalVectorizer() classifier = MultinomialNB() X_train = vectorizer.fit_transform(train_documents) classifier.fit(X_train, train_labels) X_test = vectorizer.transform(test_documents) predictions = classifier.predict(X_test)