File size: 5,416 Bytes
ea99abb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from typing import Dict, List, Union, Optional
from llms import LLM
import json
import re

def pos_tagging(
    text: str, 
    model: str = "en_core_web_sm",
    use_llm: bool = False,
    custom_instructions: str = ""
) -> Dict[str, List[str]]:
    """
    Perform Part-of-Speech tagging on the input text.

    Dispatches to an LLM-backed tagger when ``use_llm`` is True, otherwise
    to a traditional (spaCy) tagger. Both paths return the same shape, so
    the return annotation is ``Dict[str, List[str]]`` (the previous
    ``List[Union[str, List[str]]]`` annotation did not match either branch).

    Args:
        text: The input text to tag.
        model: The model to use for tagging — a spaCy pipeline name such as
            'en_core_web_sm', or an LLM model name (e.g. 'gpt-4',
            'gemini-pro') when ``use_llm`` is True.
        use_llm: Whether to use an LLM for more accurate but slower tagging.
        custom_instructions: Extra instructions prepended to the LLM prompt;
            ignored when ``use_llm`` is False.

    Returns:
        A dict with parallel 'tokens' and 'tags' lists; both empty when the
        input is blank.
    """
    # Blank or whitespace-only input short-circuits before any model loads.
    if not text.strip():
        return {"tokens": [], "tags": []}

    if use_llm:
        return _pos_tagging_with_llm(text, model, custom_instructions)
    return _pos_tagging_traditional(text, model)

def _extract_json_array(text: str) -> str:
    """Extract JSON array from text, handling various formats."""
    import re
    
    # Try to find JSON array pattern
    json_match = re.search(r'\[\s*\{.*\}\s*\]', text, re.DOTALL)
    if json_match:
        return json_match.group(0)
    
    # If not found, try to find array between square brackets
    start = text.find('[')
    end = text.rfind(']')
    if start >= 0 and end > start:
        return text[start:end+1]
    
    return text

def _pos_tagging_with_llm(
    text: str,
    model_name: str,
    custom_instructions: str = ""
) -> Dict[str, List[str]]:
    """Use an LLM for more accurate and flexible POS tagging.

    Prompts the model to emit a JSON array of ``{"token", "tag"}`` objects
    using Universal Dependencies tags, parses the reply (repairing common
    JSON mistakes if needed), and falls back to the traditional tagger on
    any failure.

    Args:
        text: The input text to tag.
        model_name: Name of the LLM to query.
        custom_instructions: Optional instructions prepended to the prompt.

    Returns:
        A dict with parallel 'tokens' and 'tags' lists.
    """
    # Create the prompt with clear instructions
    prompt = """Analyze the following text and provide Part-of-Speech (POS) tags for each token.
Return the result as a JSON array of objects with 'token' and 'tag' keys.

Use standard Universal Dependencies POS tags:
- ADJ: adjective
- ADP: adposition
- ADV: adverb
- AUX: auxiliary verb
- CONJ: coordinating conjunction
- DET: determiner
- INTJ: interjection
- NOUN: noun
- NUM: numeral
- PART: particle
- PRON: pronoun
- PROPN: proper noun
- PUNCT: punctuation
- SCONJ: subordinating conjunction
- SYM: symbol
- VERB: verb
- X: other

Example output format:
[
  {"token": "Hello", "tag": "INTJ"},
  {"token": "world", "tag": "NOUN"},
  {"token": ".", "tag": "PUNCT"}
]

Text to analyze:
"""
    
    if custom_instructions:
        prompt = f"{custom_instructions}\n\n{prompt}"
    
    prompt += f'"{text}"'
    
    try:
        # Low temperature for more deterministic, parseable output.
        llm = LLM(model=model_name, temperature=0.1, max_tokens=2000)
        
        response = llm.generate(prompt)
        print(f"LLM Raw Response: {response[:500]}...")  # Log first 500 chars
        
        if not response.strip():
            raise ValueError("Empty response from LLM")
            
        # Extract JSON array from response
        json_str = _extract_json_array(response)
        if not json_str:
            raise ValueError("No JSON array found in response")
            
        # Parse the JSON, repairing common LLM mistakes on failure.
        try:
            pos_tags = json.loads(json_str)
        except json.JSONDecodeError:
            # Quote bare keys first and retry BEFORE touching quote
            # characters: blindly replacing every "'" with '"' corrupts
            # tokens that contain apostrophes (e.g. "don't"), so that
            # substitution is kept as a last resort.
            repaired = re.sub(r'(\w+):', r'"\1":', json_str)
            try:
                pos_tags = json.loads(repaired)
            except json.JSONDecodeError:
                pos_tags = json.loads(repaired.replace("'", '"'))
        
        if not isinstance(pos_tags, list):
            raise ValueError(f"Expected list, got {type(pos_tags).__name__}")
            
        # Collect only well-formed entries with non-empty token AND tag,
        # keeping the two output lists parallel.
        tokens = []
        tags = []
        for item in pos_tags:
            if not isinstance(item, dict):
                continue
                
            token = item.get('token', '')
            tag = item.get('tag', '')
            
            if token and tag:
                tokens.append(str(token).strip())
                tags.append(str(tag).strip())
        
        if not tokens or not tags:
            raise ValueError("No valid tokens and tags found in response")
            
        return {
            'tokens': tokens,
            'tags': tags
        }
        
    except Exception as e:
        # Best-effort fallback: any failure (network, parsing, validation)
        # degrades to the traditional tagger rather than raising.
        print(f"Error in LLM POS tagging: {str(e)}")
        print(f"Falling back to traditional POS tagging...")
        return _pos_tagging_traditional(text, "en_core_web_sm")

def _pos_tagging_traditional(text: str, model: str) -> Dict[str, List[str]]:
    """Use traditional POS tagging models."""
    try:
        import spacy
        
        # Load the appropriate model
        try:
            nlp = spacy.load(model)
        except OSError:
            # Fallback to small English model if specified model is not found
            nlp = spacy.load("en_core_web_sm")
            
        # Process the text
        doc = nlp(text)
        
        # Extract tokens and POS tags
        tokens = []
        tags = []
        for token in doc:
            tokens.append(token.text)
            tags.append(token.pos_)
            
        return {
            'tokens': tokens,
            'tags': tags
        }
        
    except Exception as e:
        print(f"Error in traditional POS tagging: {str(e)}")
        return {"tokens": [], "tags": []}