import json
import re
from typing import Dict, List

from llms import LLM


def pos_tagging(
    text: str,
    model: str = "en_core_web_sm",
    use_llm: bool = False,
    custom_instructions: str = ""
) -> Dict[str, List[str]]:
    """
    Perform Part-of-Speech tagging on the input text using either an LLM or a
    traditional model.

    Args:
        text: The input text to tag.
        model: The model to use for tagging (e.g., 'en_core_web_sm', 'gpt-4',
            'gemini-pro').
        use_llm: Whether to use an LLM for more flexible but slower POS tagging.
        custom_instructions: Custom instructions for LLM-based tagging.

    Returns:
        A dictionary with parallel 'tokens' and 'tags' lists.
    """
    if not text.strip():
        return {"tokens": [], "tags": []}

    if use_llm:
        return _pos_tagging_with_llm(text, model, custom_instructions)
    return _pos_tagging_traditional(text, model)


def _extract_json_array(text: str) -> str:
    """Extract a JSON array from text, handling various response formats."""
    # Prefer a well-formed array of objects anywhere in the response.
    json_match = re.search(r'\[\s*\{.*\}\s*\]', text, re.DOTALL)
    if json_match:
        return json_match.group(0)

    # Fall back to the outermost bracket pair.
    start = text.find('[')
    end = text.rfind(']')
    if start >= 0 and end > start:
        return text[start:end + 1]

    # Nothing recognizable; return the input unchanged and let the caller fail.
    return text

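# A rough sketch of `_extract_json_array`'s behavior; the chatty wrapper text
# below is hypothetical, but the bracketed payload is what the regex matches:
#
#   _extract_json_array('Sure! Here it is: [{"token": "Hi", "tag": "INTJ"}]')
#   -> '[{"token": "Hi", "tag": "INTJ"}]'
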
def _pos_tagging_with_llm(
    text: str,
    model_name: str,
    custom_instructions: str = ""
) -> Dict[str, List[str]]:
    """Use an LLM for more flexible (but slower) POS tagging."""
    prompt = """Analyze the following text and provide Part-of-Speech (POS) tags for each token.
Return the result as a JSON array of objects with 'token' and 'tag' keys.

Use standard Universal Dependencies POS tags:
- ADJ: adjective
- ADP: adposition
- ADV: adverb
- AUX: auxiliary verb
- CCONJ: coordinating conjunction
- DET: determiner
- INTJ: interjection
- NOUN: noun
- NUM: numeral
- PART: particle
- PRON: pronoun
- PROPN: proper noun
- PUNCT: punctuation
- SCONJ: subordinating conjunction
- SYM: symbol
- VERB: verb
- X: other

Example output format:
[
    {"token": "Hello", "tag": "INTJ"},
    {"token": "world", "tag": "NOUN"},
    {"token": ".", "tag": "PUNCT"}
]

Text to analyze:
"""

    if custom_instructions:
        prompt = f"{custom_instructions}\n\n{prompt}"

    prompt += f'"{text}"'

    try:
        llm = LLM(model=model_name, temperature=0.1, max_tokens=2000)
        response = llm.generate(prompt)
        print(f"LLM Raw Response: {response[:500]}...")

        if not response.strip():
            raise ValueError("Empty response from LLM")

        json_str = _extract_json_array(response)
        if not json_str:
            raise ValueError("No JSON array found in response")

        try:
            pos_tags = json.loads(json_str)
        except json.JSONDecodeError:
            # Heuristic repair for common LLM formatting slips: single quotes
            # instead of double quotes, and unquoted object keys.
            json_str = json_str.replace("'", '"')
            json_str = re.sub(r'(\w+):', r'"\1":', json_str)
            pos_tags = json.loads(json_str)

        if not isinstance(pos_tags, list):
            raise ValueError(f"Expected list, got {type(pos_tags).__name__}")

        tokens = []
        tags = []
        for item in pos_tags:
            if not isinstance(item, dict):
                continue
            token = item.get('token', '')
            tag = item.get('tag', '')
            if token and tag:
                tokens.append(str(token).strip())
                tags.append(str(tag).strip())

        if not tokens or not tags:
            raise ValueError("No valid tokens and tags found in response")

        return {'tokens': tokens, 'tags': tags}

    except Exception as e:
        print(f"Error in LLM POS tagging: {e}")
        print("Falling back to traditional POS tagging...")
        return _pos_tagging_traditional(text, "en_core_web_sm")
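
# Worked example of the JSON repair heuristic above, on a hypothetical
# single-quoted reply:
#
#   "[{'token': 'Hi', 'tag': 'INTJ'}]"
#   -> quote swap -> '[{"token": "Hi", "tag": "INTJ"}]'  (now valid JSON)
#
# The key-quoting regex only fires on bare keys like {token: ...}; it leaves
# already-quoted keys alone, since '"' is not a word character.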


def _pos_tagging_traditional(text: str, model: str) -> Dict[str, List[str]]:
    """Use a traditional spaCy pipeline for POS tagging."""
    try:
        import spacy

        # Fall back to the small English model if the requested model
        # is not installed.
        try:
            nlp = spacy.load(model)
        except OSError:
            nlp = spacy.load("en_core_web_sm")

        doc = nlp(text)
        tokens = [token.text for token in doc]
        tags = [token.pos_ for token in doc]

        return {'tokens': tokens, 'tags': tags}

    except Exception as e:
        print(f"Error in traditional POS tagging: {e}")
        return {"tokens": [], "tags": []}