# turkish-named-entity-recognition-tests
#
# Runtime error
#
# File size: 6,567 Bytes

import streamlit as st
import pandas as pd
import spacy
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import PyPDF2
import docx
import io

def chunk_text(text, chunk_size=128):
    """Split *text* into whitespace-delimited chunks of bounded length.

    Words are packed greedily. A word starts a new chunk when adding it
    (plus one separating space) would push the running length past
    ``chunk_size``. Words are never split, so a single word longer than
    ``chunk_size`` becomes a chunk of its own that exceeds the limit.

    Args:
        text: Input string. It is tokenised with ``str.split()``, so any run
            of whitespace becomes a single space in the output.
        chunk_size: Length budget (in characters) used to decide when to
            start a new chunk.

    Returns:
        A list of non-empty chunk strings, in order. Empty or
        whitespace-only input yields an empty list.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + len(word) + 1 > chunk_size:
            # An over-long word arriving while nothing is buffered used to
            # flush an empty chunk (''); only flush real content.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += len(word) + 1

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Use the "wide" page layout for the app.
# NOTE(review): Streamlit documents set_page_config as needing to run before
# any other Streamlit command on the page — keep it ahead of the other st.*
# calls below; confirm against the installed Streamlit version.
st.set_page_config(layout="wide")

# Function to read text from uploaded file
def read_file(file):
    """Extract plain text from an uploaded TXT, PDF or DOCX file.

    The format is chosen from ``file.type`` (a MIME type string) and the
    content is read through ``file.getvalue()``.

    Args:
        file: Uploaded-file object exposing ``type`` and ``getvalue()``.

    Returns:
        The extracted text, or ``None`` (after showing ``st.error``) when the
        MIME type is unsupported or a text file is not valid UTF-8.
    """
    if file.type == "text/plain":
        try:
            # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM,
            # which would otherwise show up as a stray character in the text.
            return file.getvalue().decode("utf-8-sig")
        except UnicodeDecodeError:
            st.error("Could not decode the text file as UTF-8")
            return None
    elif file.type == "application/pdf":
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.getvalue()))
        # Guard against pages that yield no text so the join cannot fail.
        return " ".join(page.extract_text() or "" for page in pdf_reader.pages)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = docx.Document(io.BytesIO(file.getvalue()))
        return " ".join(paragraph.text for paragraph in doc.paragraphs)
    else:
        st.error("Unsupported file type")
        return None

st.title("Turkish NER Models Testing")

# Checkpoints offered in the sidebar.
model_list = [
    'girayyagmur/bert-base-turkish-ner-cased',
    'savasy/bert-base-turkish-ner-cased',
    'xlm-roberta-large-finetuned-conll03-english',
    'asahi417/tner-xlm-roberta-base-ontonotes5'
]

st.sidebar.header("Select NER Model")
# Widgets need a non-empty label (Streamlit warns otherwise); the header
# above already serves as the visible caption, so the label is collapsed.
model_checkpoint = st.sidebar.radio(
    "Select NER Model",
    model_list,
    label_visibility="collapsed",
)

# NOTE(review): the link targets the akdeniz27 profile although none of the
# listed checkpoints live there — confirm the intended URL.
st.sidebar.write("For details of models: https://huggingface.co/akdeniz27/")
st.sidebar.write("Only PDF, DOCX, and TXT files are supported.")

# Determine aggregation strategy: these checkpoints use "simple", every
# other checkpoint uses "first".
simple_aggregation_models = {
    "akdeniz27/xlm-roberta-base-turkish-ner",
    "xlm-roberta-large-finetuned-conll03-english",
    "asahi417/tner-xlm-roberta-base-ontonotes5",
}
aggregation = "simple" if model_checkpoint in simple_aggregation_models else "first"

st.subheader("Select Text Input Method")
input_method = st.radio(
    "Select Text Input Method",
    ('Write or Paste New Text', 'Upload File'),
    label_visibility="collapsed",
)

# Always a string, so later truthiness checks never see None.
input_text = ""
if input_method == "Write or Paste New Text":
    input_text = st.text_area('Write or Paste Text Below', value="", height=128)
else:
    uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
    if uploaded_file is not None:
        # read_file() returns None when it cannot handle the upload.
        input_text = read_file(uploaded_file) or ""
        if input_text:
            st.text_area("Extracted Text", input_text, height=128)

@st.cache_resource
def setModel(model_checkpoint, aggregation):
    """Build the ``'ner'`` pipeline for a checkpoint.

    Loads the token-classification model and the tokenizer for
    ``model_checkpoint`` via ``from_pretrained`` and wraps both in a
    ``pipeline`` whose ``aggregation_strategy`` is ``aggregation``. The
    function is decorated with ``st.cache_resource``.

    Args:
        model_checkpoint: Checkpoint identifier passed to ``from_pretrained``.
        aggregation: Value forwarded as ``aggregation_strategy``.

    Returns:
        The constructed pipeline object.
    """
    ner_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    ner_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    pipeline_kwargs = {
        "model": ner_model,
        "tokenizer": ner_tokenizer,
        "aggregation_strategy": aggregation,
    }
    return pipeline('ner', **pipeline_kwargs)

def entity_comb(output):
    """Merge directly adjacent entity fragments that share an entity group.

    An entity is folded into the previous result when it starts exactly where
    that one ends and both carry the same ``entity_group``: its ``word`` is
    appended and the ``end`` is extended. All other keys keep the values of
    the first fragment.

    This function is intentionally not wrapped in ``st.cache_resource``: that
    cache hands every caller the same object, and the returned dicts are
    modified by the caller afterwards.

    Args:
        output: List of entity dicts with at least ``start``, ``end``,
            ``word`` and ``entity_group`` keys, in text order.

    Returns:
        A new list of new dicts; ``output`` and its dicts are left untouched.
    """
    output_comb = []
    for entity in output:
        previous = output_comb[-1] if output_comb else None
        if (
            previous is not None
            and entity["start"] == previous["end"]
            and entity["entity_group"] == previous["entity_group"]
        ):
            previous["word"] += entity["word"]
            previous["end"] = entity["end"]
        else:
            # Shallow copy so merging never edits the caller's dicts.
            output_comb.append(dict(entity))
    return output_comb

# Entity groups that are reported but never replaced by a mask token.
UNMASKED_ENTITY_GROUPS = ('CARDINAL', 'EVENT')


def create_mask_dict(entities):
    """Map each distinct entity word to a mask token ``"<GROUP>_<n>"``.

    ``n`` counts the distinct words seen so far within the same entity group,
    starting at 1. A word that occurs again reuses its first token. Entities
    whose group is in ``UNMASKED_ENTITY_GROUPS`` get no entry.

    Args:
        entities: Iterable of dicts with ``word`` and ``entity_group`` keys.

    Returns:
        Dict from entity word to its mask token.
    """
    mask_dict = {}
    group_counts = {}
    for entity in entities:
        group = entity['entity_group']
        word = entity['word']
        if group in UNMASKED_ENTITY_GROUPS or word in mask_dict:
            continue
        group_counts[group] = group_counts.get(group, 0) + 1
        mask_dict[word] = f"{group}_{group_counts[group]}"
    return mask_dict


def create_masked_text(input_text, entities, mask_dict=None):
    """Return ``input_text`` with every maskable entity span replaced.

    Args:
        input_text: Text that the entities' ``start``/``end`` offsets index.
        entities: Entity dicts with ``start``, ``end``, ``word`` and
            ``entity_group`` keys.
        mask_dict: Mapping from entity word to replacement token. When
            omitted it is built from ``entities`` with ``create_mask_dict``.

    Returns:
        The masked text. Entities in ``UNMASKED_ENTITY_GROUPS`` are kept.

    Raises:
        KeyError: If a maskable entity's word is missing from ``mask_dict``.
    """
    if mask_dict is None:
        mask_dict = create_mask_dict(entities)

    masked_text = input_text
    # Replace from the end of the text backwards so that earlier offsets stay
    # valid while the string changes length.
    for entity in sorted(entities, key=lambda x: x['start'], reverse=True):
        if entity['entity_group'] not in UNMASKED_ENTITY_GROUPS:
            masked_text = (
                masked_text[:entity['start']]
                + mask_dict[entity['word']]
                + masked_text[entity['end']:]
            )
    return masked_text


Run_Button = st.button("Run")

if Run_Button and input_text:
    ner_pipeline = setModel(model_checkpoint, aggregation)

    # Chunk the input text. chunk_text() splits on whitespace, and entity
    # offsets below are computed against the chunks joined by single spaces,
    # so that joined string (not the raw input, which may contain newlines or
    # repeated spaces) is the text the offsets actually index.
    chunks = chunk_text(input_text)
    normalized_text = ' '.join(chunks)

    # Process each chunk, shifting entity offsets from chunk-relative to
    # positions in normalized_text.
    all_outputs = []
    offset = 0
    for chunk in chunks:
        output = ner_pipeline(chunk)
        for entity in output:
            entity['start'] += offset
            entity['end'] += offset
        all_outputs.extend(output)
        # +1 for the single space that separates this chunk from the next.
        offset += len(chunk) + 1

    # Combine entities
    output_comb = entity_comb(all_outputs)

    # Create mask dictionary
    mask_dict = create_mask_dict(output_comb)

    masked_text = create_masked_text(normalized_text, output_comb, mask_dict)

    # Record, per entity, the token it is masked with (or the word itself
    # when the entity group is not masked).
    for entity in output_comb:
        if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
            entity['masked_word'] = mask_dict.get(entity['word'], entity['word'])
        else:
            entity['masked_word'] = entity['word']

    # Spacy display logic with entity numbering
    spacy_display = {"ents": [], "text": normalized_text, "title": None}
    for entity in output_comb:
        if entity['entity_group'] not in ['CARDINAL', 'EVENT']:
            # Take the number after the LAST underscore: entity groups may
            # themselves contain underscores (e.g. "WORK_OF_ART").
            mask_number = mask_dict[entity['word']].rsplit('_', 1)[-1]
            label = f"{entity['entity_group']}_{mask_number}"
        else:
            label = entity['entity_group']
        spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": label})

    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
    st.write(html, unsafe_allow_html=True)

    st.subheader("Masking Dictionary")
    st.json(mask_dict)

    st.subheader("Masked Text Preview")
    st.text(masked_text)