Examples
========

Real-world examples of using Crosstem.

Text Preprocessing
------------------

Normalize documents for analysis::

   from crosstem import DerivationalStemmer
   
   def preprocess_document(text, language='eng'):
       """Lowercase ``text`` and replace each word with its stem.

       Words are split on whitespace; the stems are re-joined with
       single spaces and returned as one string.
       """
       stemmer = DerivationalStemmer(language)
       return ' '.join(map(stemmer.stem, text.lower().split()))
   
   # Example
   sample = "The organization is organizing a conference for organizers"
   print(preprocess_document(sample))
   # Output: "the organize is organize a conference for organize"

Information Retrieval
---------------------

Expand search queries with word families::

   from crosstem import DerivationalStemmer

   def expand_query(query, language='eng'):
       """Expand a search term into its derivational word family.

       The term is stemmed to its root, the root's word family is looked
       up with ``get_word_family``, and the family is returned as a
       sorted list.
       """
       stemmer = DerivationalStemmer(language)

       # Look the family up by root rather than by the raw query, so the
       # lookup key is the same whichever family member was typed.
       root = stemmer.stem(query)
       family = stemmer.get_word_family(root)

       return sorted(family)
   
   # Example: show the first ten family members (the list is sorted)
   variants = expand_query('organize')
   print(f"Searching for: {', '.join(variants[:10])}...")
   # Searching for: disorganization, disorganize, organ,
   # organic, organism, organization, organizational, ...

Document Similarity
-------------------

Compare documents using morphological roots::

   from crosstem import DerivationalStemmer
   from collections import Counter
   
   def document_similarity(doc1, doc2, language='eng'):
       """Return the fraction of distinct roots that both documents share.

       Each document is split on whitespace, lowercased and stemmed.
       The result is ``len(shared roots) / len(all roots)``, or ``0``
       when neither document contains any word.
       """
       stemmer = DerivationalStemmer(language)

       # Tally the roots of each document
       counts1 = Counter(stemmer.stem(token.lower()) for token in doc1.split())
       counts2 = Counter(stemmer.stem(token.lower()) for token in doc2.split())

       # Key views support set algebra directly
       common = counts1.keys() & counts2.keys()
       combined = counts1.keys() | counts2.keys()

       if not combined:
           return 0
       return len(common) / len(combined)
   
   # Example
   doc1 = "The organization organized an organizational meeting"
   doc2 = "We need to organize and create an organization"
   similarity = document_similarity(doc1, doc2)
   # Assuming every organiz* form stems to "organize", the documents share
   # 2 roots ("organize", "an") out of 9 distinct roots overall: 2/9.
   print(f"Similarity: {similarity:.2%}")  # ~22%

Topic Modeling Preprocessing
-----------------------------

Prepare text for topic modeling::

   from crosstem import DerivationalStemmer
   
   class MorphologicalTokenizer:
       """Callable-friendly tokenizer that emits stems instead of words."""

       def __init__(self, language='eng', min_length=3):
           self.stemmer = DerivationalStemmer(language)
           self.min_length = min_length

       def tokenize(self, text):
           """Split ``text`` into words and return the list of their stems.

           Each whitespace-separated word is lowercased and stripped of
           non-alphanumeric characters; words shorter than
           ``min_length`` after stripping are dropped.
           """
           cleaned_words = (
               ''.join(filter(str.isalnum, raw))
               for raw in text.lower().split()
           )
           return [
               self.stemmer.stem(token)
               for token in cleaned_words
               if len(token) >= self.min_length
           ]
   
   # Example with sklearn
   from sklearn.feature_extraction.text import CountVectorizer

   tokenizer = MorphologicalTokenizer()
   vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize)

   documents = [
       "The organization organized a meeting",
       "Organizers are organizing the event",
       "She works for an organizational consultancy"
   ]

   X = vectorizer.fit_transform(documents)
   print(vectorizer.get_feature_names_out())
   # Only "a" and "an" fall below min_length=3; three-letter words such as
   # "the" are kept because no stop-word list is configured.
   # ['are', 'consultancy', 'event', 'for', 'meet', 'organize', 'she',
   #  'the', 'work']

Historical Linguistics
----------------------

Track word evolution across languages::

   from crosstem import (
       EtymologyLinker,
       download_etymology,
       is_etymology_downloaded,
   )

   # Download etymology data (one-time): skip the download when the data
   # is already present locally.
   if not is_etymology_downloaded():
       download_etymology()
   
   def trace_word_origin(word, start_lang='English', max_depth=5):
       """Trace etymology back through ancestor languages.

       Starting from ``(start_lang, word)``, look up the current word's
       etymology and follow its first ``INHERITED_FROM`` entry, for at
       most ``max_depth`` steps.

       Returns a list of ``(language, word)`` tuples whose first element
       is the starting word. The walk stops early when a word has no
       etymology entry, has no recorded ancestor, or leads back to a
       word that is already in the chain.
       """
       linker = EtymologyLinker()

       chain = [(start_lang, word)]
       current_lang, current_word = start_lang, word

       for _ in range(max_depth):
           etymology = linker.get_etymology(current_lang, current_word)

           if not etymology or 'INHERITED_FROM' not in etymology:
               break

           # The key may be present with an empty list; treat that the
           # same as "no ancestor" instead of raising IndexError.
           ancestors = etymology['INHERITED_FROM']
           if not ancestors:
               break

           # Follow inheritance chain (first listed ancestor only)
           inherited = ancestors[0]
           step = (inherited['language'], inherited['word'])

           # Guard against cyclic data so the chain never repeats itself.
           if step in chain:
               break

           chain.append(step)
           current_lang, current_word = step

       return chain
   
   # Example
   for lang, word in trace_word_origin('organize'):
       print(f"{lang}: {word}")
   # English: organize
   # Middle English: organisen
   # Old French: organiser
   # Late Latin: organizare
   # ...

Corpus Analysis
---------------

Analyze word relationships in a corpus::

   from crosstem import DerivationalStemmer
   from collections import defaultdict
   
   def analyze_word_families(corpus, language='eng', top_n=10):
       """Return the ``top_n`` largest word families found in ``corpus``.

       Every word is lowercased and grouped under its stem. The result
       is a list of ``(root, set_of_words)`` pairs ordered from the
       largest family to the smallest.
       """
       stemmer = DerivationalStemmer(language)

       # root -> distinct lowercased words that stem to it
       families = defaultdict(set)
       for raw in corpus:
           lowered = raw.lower()
           families[stemmer.stem(lowered)].add(lowered)

       # Negated size gives a descending, stable ordering
       ranked = sorted(families.items(), key=lambda item: -len(item[1]))

       return ranked[:top_n]
   
   # Example
   corpus = [
       'organize', 'organization', 'organizational', 'organizer',
       'organizing', 'reorganize', 'disorganize', 'beauty',
       'beautiful', 'beautifully', 'beautify', 'run', 'running',
       'runner', 'ran', 'rerun'
   ]

   top_families = analyze_word_families(corpus, top_n=3)
   for root, members in top_families:
       print(f"{root}: {len(members)} variants - {sorted(members)}")
   # Families are listed largest first, so "run" (5) precedes "beauty" (4):
   # organize: 7 variants - ['disorganize', 'organization', ...]
   # run: 5 variants - ['ran', 'rerun', 'run', 'runner', 'running']
   # beauty: 4 variants - ['beautiful', 'beautifully', ...]

Multilingual Processing
-----------------------

Process documents in multiple languages::

   from crosstem import DerivationalStemmer

   class MultilingualStemmer:
       """Route stemming requests to one ``DerivationalStemmer`` per language."""

       def __init__(self, languages):
           # Build every stemmer up front, keyed by its language code.
           self.stemmers = {
               lang: DerivationalStemmer(lang)
               for lang in languages
           }

       def stem(self, word, language):
           """Stem word in specified language.

           Raises ``ValueError`` when ``language`` was not among the
           languages given to the constructor.
           """
           if language not in self.stemmers:
               raise ValueError(f"Unsupported language: {language}")
           return self.stemmers[language].stem(word)

       def stem_document(self, document, language):
           """Stem every whitespace-separated word; returns a list of stems."""
           return [self.stem(word, language) for word in document.split()]
   
   # Example
   stemmer = MultilingualStemmer(['eng', 'fra', 'deu'])

   samples = [
       ('organization', 'eng'),  # English -> organize
       ('organisation', 'fra'),  # French  -> organiser
       ('Organisation', 'deu'),  # German  -> organisieren
   ]
   for word, lang in samples:
       print(stemmer.stem(word, lang))

Named Entity Recognition
-------------------------

Normalize entity variations::

   from crosstem import DerivationalStemmer
   
   def normalize_entities(entities, language='eng'):
       """Group entity mentions by the stemmed form of their words.

       Returns a dict that maps each canonical form (the mention's
       lowercased words, stemmed and space-joined) to the list of
       original mentions that produced it, in input order.
       """
       stemmer = DerivationalStemmer(language)

       groups = {}
       for mention in entities:
           canonical = ' '.join(
               stemmer.stem(part.lower()) for part in mention.split()
           )
           groups.setdefault(canonical, []).append(mention)

       return groups
   
   # Example
   entities = [
       "United Nations Organization",
       "UN Organization",
       "Organizational Structure",
       "Organizing Committee"
   ]

   # One line per canonical form, followed by the mentions grouped under it
   for canonical, variants in normalize_entities(entities).items():
       print(f"{canonical}: {variants}")

Question Answering
------------------

Improve QA by matching word roots::

   from crosstem import DerivationalStemmer
   
   def find_relevant_passages(question, passages, language='eng'):
       """Find passages relevant to question using root matching.

       Each passage is scored by the number of distinct roots it shares
       with the question. Passages are returned from the highest score
       to the lowest; passages with equal scores keep their input order.
       """
       stemmer = DerivationalStemmer(language)

       def roots_of(text):
           # Drop punctuation before stemming so that "structure?" and
           # "structure." are treated as the same word.
           cleaned = (
               ''.join(c for c in token if c.isalnum())
               for token in text.lower().split()
           )
           # Tokens made only of punctuation become empty; skip them.
           return {stemmer.stem(token) for token in cleaned if token}

       q_roots = roots_of(question)

       # sorted() is stable, so ties preserve the original passage order.
       return sorted(
           passages,
           key=lambda passage: len(q_roots & roots_of(passage)),
           reverse=True,
       )
   
   # Example
   passages = [
       "Companies use organizational charts to show structure.",
       "The meeting was well organized by the committee.",
       "Trees provide shade in the summer."
   ]
   question = "How do organizations organize their structure?"

   relevant = find_relevant_passages(question, passages)
   top_passage = relevant[0]
   print("Most relevant:", top_passage)
   # "Companies use organizational charts to show structure."

Text Classification Features
-----------------------------

Generate morphological features::

   from crosstem import DerivationalStemmer
   from sklearn.feature_extraction.text import CountVectorizer

   class MorphologicalVectorizer:
       """Count-based vectorizer that works on stems instead of surface words."""

       def __init__(self, language='eng'):
           self.stemmer = DerivationalStemmer(language)
           self.vectorizer = CountVectorizer()

       def _stem_documents(self, documents):
           """Return each document with every whitespace-separated word stemmed."""
           # The other examples lowercase words before stemming; do the same
           # here so capitalized words are looked up in the same form.
           return [
               ' '.join(self.stemmer.stem(w.lower()) for w in doc.split())
               for doc in documents
           ]

       def fit_transform(self, documents):
           """Stem ``documents``, learn the vocabulary and return the count matrix."""
           return self.vectorizer.fit_transform(self._stem_documents(documents))

       def transform(self, documents):
           """Stem ``documents`` and vectorize them with the learned vocabulary."""
           return self.vectorizer.transform(self._stem_documents(documents))
   
   # Example with classification
   from sklearn.naive_bayes import MultinomialNB

   # Tiny illustrative dataset; replace with your own labelled corpus.
   train_documents = [
       "The organization organized a meeting",
       "Organizers are organizing the event",
       "Trees provide shade in the summer",
       "The forest is beautiful in autumn",
   ]
   train_labels = ['business', 'business', 'nature', 'nature']
   test_documents = ["She works for an organizational consultancy"]

   vectorizer = MorphologicalVectorizer()
   classifier = MultinomialNB()

   X_train = vectorizer.fit_transform(train_documents)
   classifier.fit(X_train, train_labels)

   X_test = vectorizer.transform(test_documents)
   predictions = classifier.predict(X_test)
   print(predictions)
   # Expected: ['business'], assuming "organizational" stems to "organize"