# ling/tasks/pos_tagging.py
# Author: Nam Fam ("update files", commit ea99abb)
from typing import Dict, List, Union, Optional
from llms import LLM
import json
import re
def pos_tagging(
    text: str,
    model: str = "en_core_web_sm",
    use_llm: bool = False,
    custom_instructions: str = ""
) -> Dict[str, List[Union[str, List[str]]]]:
    """
    Tag each token of *text* with its part of speech.

    Dispatches to an LLM-backed tagger when ``use_llm`` is True, otherwise
    to a traditional (spaCy) tagger.

    Args:
        text: Input text to analyze.
        model: Model identifier (e.g. 'en_core_web_sm', 'gpt-4', 'gemini-pro').
        use_llm: Prefer the slower but more flexible LLM-based tagger.
        custom_instructions: Extra prompt instructions for the LLM path.

    Returns:
        Dict with parallel 'tokens' and 'tags' lists (both empty for
        blank input).
    """
    # Blank/whitespace-only input short-circuits to an empty result.
    if not text.strip():
        return {"tokens": [], "tags": []}
    return (
        _pos_tagging_with_llm(text, model, custom_instructions)
        if use_llm
        else _pos_tagging_traditional(text, model)
    )
def _extract_json_array(text: str) -> str:
"""Extract JSON array from text, handling various formats."""
import re
# Try to find JSON array pattern
json_match = re.search(r'\[\s*\{.*\}\s*\]', text, re.DOTALL)
if json_match:
return json_match.group(0)
# If not found, try to find array between square brackets
start = text.find('[')
end = text.rfind(']')
if start >= 0 and end > start:
return text[start:end+1]
return text
def _pos_tagging_with_llm(
    text: str,
    model_name: str,
    custom_instructions: str = ""
) -> Dict[str, List[str]]:
    """
    POS-tag *text* by prompting an LLM for a JSON array of token/tag pairs.

    Args:
        text: Input text to tag.
        model_name: LLM identifier passed through to the ``LLM`` wrapper.
        custom_instructions: Optional instructions prepended to the prompt.

    Returns:
        Dict with parallel 'tokens' and 'tags' lists. On any failure
        (empty response, unparsable JSON, no valid pairs) this falls back
        to ``_pos_tagging_traditional`` rather than raising.
    """
    # Create the prompt with clear instructions
    prompt = """Analyze the following text and provide Part-of-Speech (POS) tags for each token.
Return the result as a JSON array of objects with 'token' and 'tag' keys.
Use standard Universal Dependencies POS tags:
- ADJ: adjective
- ADP: adposition
- ADV: adverb
- AUX: auxiliary verb
- CONJ: coordinating conjunction
- DET: determiner
- INTJ: interjection
- NOUN: noun
- NUM: numeral
- PART: particle
- PRON: pronoun
- PROPN: proper noun
- PUNCT: punctuation
- SCONJ: subordinating conjunction
- SYM: symbol
- VERB: verb
- X: other
Example output format:
[
  {"token": "Hello", "tag": "INTJ"},
  {"token": "world", "tag": "NOUN"},
  {"token": ".", "tag": "PUNCT"}
]
Text to analyze:
"""
    if custom_instructions:
        prompt = f"{custom_instructions}\n\n{prompt}"
    prompt += f'"{text}"'
    try:
        # Low temperature for more deterministic, parseable output.
        llm = LLM(model=model_name, temperature=0.1, max_tokens=2000)
        response = llm.generate(prompt)
        print(f"LLM Raw Response: {response[:500]}...")  # Log first 500 chars
        if not response.strip():
            raise ValueError("Empty response from LLM")
        # Pull the JSON array out of any surrounding prose/markdown.
        json_str = _extract_json_array(response)
        if not json_str:
            raise ValueError("No JSON array found in response")
        try:
            pos_tags = json.loads(json_str)
        except json.JSONDecodeError:
            # Best-effort repair of common LLM JSON mistakes. NOTE: the
            # quote swap can still corrupt apostrophes inside tokens;
            # this only runs on already-invalid JSON as a last resort.
            json_str = json_str.replace("'", '"')
            # Quote bare keys only in key position (after '{' or ','),
            # so colons inside string values (e.g. "12:30") are untouched.
            json_str = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', json_str)
            pos_tags = json.loads(json_str)
        if not isinstance(pos_tags, list):
            raise ValueError(f"Expected list, got {type(pos_tags).__name__}")
        tokens = []
        tags = []
        for item in pos_tags:
            if not isinstance(item, dict):
                continue
            token = item.get('token', '')
            tag = item.get('tag', '')
            if token and tag:  # Only add if both token and tag are non-empty
                tokens.append(str(token).strip())
                tags.append(str(tag).strip())
        if not tokens or not tags:
            raise ValueError("No valid tokens and tags found in response")
        return {
            'tokens': tokens,
            'tags': tags
        }
    except Exception as e:
        # Any failure degrades gracefully to the traditional tagger.
        print(f"Error in LLM POS tagging: {str(e)}")
        print(f"Falling back to traditional POS tagging...")
        return _pos_tagging_traditional(text, "en_core_web_sm")
def _pos_tagging_traditional(text: str, model: str) -> Dict[str, List[str]]:
"""Use traditional POS tagging models."""
try:
import spacy
# Load the appropriate model
try:
nlp = spacy.load(model)
except OSError:
# Fallback to small English model if specified model is not found
nlp = spacy.load("en_core_web_sm")
# Process the text
doc = nlp(text)
# Extract tokens and POS tags
tokens = []
tags = []
for token in doc:
tokens.append(token.text)
tags.append(token.pos_)
return {
'tokens': tokens,
'tags': tags
}
except Exception as e:
print(f"Error in traditional POS tagging: {str(e)}")
return {"tokens": [], "tags": []}