turkish-named-entity-recognition-tests

Runtime error

App Files Files Community

turkish-named-entity-recognition-tests / app.py

umarigan

Update app.py

1c9f94a verified 9 months ago

raw

history blame

2.82 kB

	import streamlit as st
	import pandas as pd
	import spacy
	from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
	import PyPDF2
	import docx
	import io
	import re

	# ... [Previous functions remain unchanged] ...

	def create_mask_dict(entities, excluded_groups=('CARDINAL', 'EVENT')):
	    """Map each distinct entity surface form to a numbered placeholder.

	    Placeholders have the form ``"<entity_group>_<n>"`` where ``n`` counts,
	    per entity group and starting at 1, the distinct words seen so far in
	    iteration order. A word that occurs again keeps its first placeholder.

	    Args:
	        entities: Iterable of dicts, each providing at least the keys
	            ``'entity_group'`` and ``'word'``.
	        excluded_groups: Entity groups that must not be masked. Defaults
	            to the groups the original code skipped.

	    Returns:
	        dict mapping each masked word to its placeholder string.
	    """
	    mask_dict = {}
	    entity_counters = {}
	    for entity in entities:
	        group = entity['entity_group']
	        if group in excluded_groups:
	            continue
	        word = entity['word']
	        if word in mask_dict:
	            # Already assigned: reuse the placeholder, do not bump the counter.
	            continue
	        entity_counters[group] = entity_counters.get(group, 0) + 1
	        mask_dict[word] = f"{group}_{entity_counters[group]}"
	    return mask_dict

	def create_masked_text(input_text, mask_dict):
	    """Return ``input_text`` with every word in ``mask_dict`` replaced by its mask.

	    Words are substituted longest-first so that a multi-word entity is
	    masked before any shorter entity contained in it. A word is only
	    replaced where it is not directly adjacent to another word character,
	    so ``"Ali"`` does not match inside ``"Aliye"``.

	    Args:
	        input_text: The text to mask.
	        mask_dict: Mapping of original word -> placeholder string.

	    Returns:
	        The masked text.
	    """
	    masked_text = input_text
	    for word, mask in sorted(mask_dict.items(), key=lambda item: len(item[0]), reverse=True):
	        if not word:
	            # An empty pattern would match at every position and flood the
	            # text with placeholders.
	            continue
	        # Lookarounds instead of \b: \b requires a word character on one
	        # side, so it never matches next to an entity that begins or ends
	        # with punctuation (e.g. "A.Ş." followed by a space).
	        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
	        # A callable replacement inserts the mask literally; a plain string
	        # would have its backslashes interpreted as template escapes.
	        masked_text = re.sub(pattern, lambda _match, mask=mask: mask, masked_text)
	    return masked_text

	# Main Streamlit flow: when "Run" is pressed and input text is present, run
	# NER chunk by chunk, build the masking dictionary, and display the masked
	# text, the dictionary, a table, and the original text with highlights.
	Run_Button = st.button("Run")

	if Run_Button and input_text:
	    ner_pipeline = setModel(model_checkpoint, aggregation)

	    # Chunk the input text
	    chunks = chunk_text(input_text)

	    # Process each chunk, shifting entity offsets from chunk-relative to
	    # positions in the full text.
	    # NOTE(review): the offset assumes the chunks rejoin with exactly one
	    # space between them; chunk_text is not visible here, so confirm.
	    all_outputs = []
	    offset = 0
	    for i, chunk in enumerate(chunks):
	        output = ner_pipeline(chunk)

	        # Adjust start and end positions for entities in chunks after the first
	        if i > 0:
	            for entity in output:
	                entity['start'] += offset
	                entity['end'] += offset

	        all_outputs.extend(output)
	        # Running total; equals len(' '.join(chunks[:i + 1])) + 1 without
	        # re-joining all previous chunks on every iteration.
	        offset += len(chunk) + 1

	    # Combine entities
	    output_comb = entity_comb(all_outputs)

	    # Create mask dictionary
	    mask_dict = create_mask_dict(output_comb)

	    # Create masked text
	    masked_text = create_masked_text(input_text, mask_dict)

	    st.subheader("Masked Text")
	    st.text(masked_text)

	    st.subheader("Masking Dictionary")
	    st.json(mask_dict)

	    # Create a DataFrame for display
	    df = pd.DataFrame(list(mask_dict.items()), columns=['Original', 'Masked'])
	    st.subheader("Masking Table")
	    st.dataframe(df)

	    # Optional: Display original text with highlights
	    st.subheader("Original Text with Highlights")
	    spacy_display = {"ents": [], "text": input_text, "title": None}
	    for entity in output_comb:
	        # Same exclusion list as create_mask_dict, so every word looked up
	        # below has an entry in mask_dict.
	        if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
	            label = mask_dict[entity['word']]
	            spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": label})

	    # manual=True: render from the pre-built dict instead of a spaCy Doc.
	    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
	    st.write(html, unsafe_allow_html=True)