File size: 3,921 Bytes
894b24d
 
 
 
 
 
 
 
 
 
 
 
d4df546
894b24d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10eaeda
f43384a
894b24d
 
 
d3f1526
 
894b24d
d3f1526
d4df546
4b3d121
 
9b360b3
c9f9a75
 
b094a6f
c9f9a75
 
 
 
b094a6f
c9f9a75
 
953205d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4df546
 
cc4118d
d4df546
52ded96
894b24d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from metrics import calc_metrics
import gradio as gr
from openai import OpenAI
import os

from transformers import pipeline
# from dotenv import load_dotenv, find_dotenv
import huggingface_hub
import json
# from simcse import SimCSE # use for gpt
from evaluate_data import store_sample_data, get_metrics_trf

# store_sample_data()



# Load the bundled evaluation samples once at import time.
with open('./data/sample_data.json', 'r') as f:
    # Shape sketched by the original author (one dict per sample):
    # sample_data = [
    #     {'id': "", 'text': "", 'orgs': ["", ""]}
    # ]
    sample_data = json.load(f)
    
# _ = load_dotenv(find_dotenv()) # read local .env file
# HF_TOKEN is mandatory: indexing os.environ raises KeyError when it is unset.
hf_token= os.environ['HF_TOKEN']
huggingface_hub.login(hf_token)

# Token-classification pipeline. NOTE(review): in the code visible here `pipe`
# is only referenced by the commented-out find_orgs_trf helper — confirm it is
# still needed before paying the model-load cost at import.
pipe = pipeline("token-classification", model="elshehawy/finer-ord-transformers", aggregation_strategy="first")


# Default chat model used by get_completion.
llm_model = 'gpt-3.5-turbo-0125'
# openai.api_key = os.environ['OPENAI_API_KEY']

# Unlike HF_TOKEN, the key is read with .get(), so a missing variable yields
# None here rather than a KeyError at this line.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)


def get_completion(prompt, model=llm_model):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user-role message of the request.
        model: Chat model name; defaults to the module-level `llm_model`.

    Returns:
        The `message.content` of the first choice in the response.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_turn],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def find_orgs_gpt(sentence):
    """Ask the chat model for the organizations mentioned in `sentence`.

    The model is instructed to answer with a JSON array of strings. Its reply
    is free text, so the array is located and parsed defensively instead of
    assuming the whole reply is valid JSON.

    Args:
        sentence: Text to search for organization names.

    Returns:
        A list of organization name strings. An empty list is returned when
        the reply is empty, contains no JSON array, or cannot be parsed.
    """
    prompt = f"""
    In context of named entity recognition (NER), find all organizations in the text delimited by triple backticks.
    
    text:
    ```
    {sentence}
    ```
    You should output only a list of organizations and follow this output format exactly: ["org_1", "org_2", "org_3"]
    """

    sent_orgs_str = get_completion(prompt)
    # The message content can be None or empty; treat that as "no organizations".
    if not sent_orgs_str:
        return []

    # Tolerate replies that wrap the array in code fences or extra prose by
    # parsing only the outermost [...] span.
    start = sent_orgs_str.find('[')
    end = sent_orgs_str.rfind(']')
    if start == -1 or end < start:
        return []

    try:
        sent_orgs = json.loads(sent_orgs_str[start:end + 1])
    except json.JSONDecodeError:
        # One malformed reply should not abort a whole evaluation run.
        return []

    if not isinstance(sent_orgs, list):
        return []
    # Keep only string entries so downstream metric code sees a flat list of names.
    return [org for org in sent_orgs if isinstance(org, str)]


    
# def find_orgs_trf(sentence):
#     org_list = []
#     for ent in pipe(sentence):
#         if ent['entity_group'] == 'ORG':
#             # message += f'\n- {ent["word"]} \t- score: {ent["score"]}'
#             # message += f'\n- {ent["word"]}'# \t- score: {ent["score"]}'
#             org_list.append(ent['word'])
#     return list(set(org_list))


# Gold organization lists, one per sample, in sample_data order.
true_orgs = [sent['orgs'] for sent in sample_data]

# Runs at import time: one find_orgs_gpt call (and therefore one chat
# completion request) per sample.
# NOTE(review): with the calc_metrics call below commented out, these
# predictions are not consumed by anything visible in this file.
predicted_orgs_gpt = [find_orgs_gpt(sent['text']) for sent in sample_data]
# predicted_orgs_trf = [find_orgs_trf(sent['text']) for sent in sample_data]

# Module-level metrics container; stays empty while both computations below
# remain commented out.
all_metrics = {}

# sim_model = SimCSE('sentence-transformers/all-MiniLM-L6-v2')
# all_metrics['gpt'] = calc_metrics(true_orgs, predicted_orgs_gpt, sim_model)
# NOTE(review): the message below has a typo ("Finiding") and is printed even
# though the metrics call that follows it is disabled.
print('Finiding all metrics trf')
# all_metrics['trf'] = get_metrics_trf()



# Sample input text. NOTE(review): not referenced elsewhere in the code visible
# here — confirm whether it is still used.
example = """
My latest exclusive for The Hill : Conservative frustration over Republican efforts to force a House vote on reauthorizing the Export - Import Bank boiled over Wednesday during a contentious GOP meeting.

"""
def _load_uploaded_json(uploaded_file):
    """Parse an upload given as raw JSON bytes/text or as a readable file object."""
    if isinstance(uploaded_file, (bytes, bytearray, str)):
        return json.loads(uploaded_file)
    if hasattr(uploaded_file, 'read'):
        return json.load(uploaded_file)
    raise TypeError(
        'Expected JSON bytes, text or a file object, '
        f'got {type(uploaded_file).__name__}'
    )


def find_orgs(uploaded_file):
    """Evaluate an uploaded JSON dataset and return the metrics as text.

    Args:
        uploaded_file: The upload handed over by the Gradio component. With
            `type='binary'` this is the raw file content as bytes; a readable
            file object or a JSON string is accepted as well.

    Returns:
        The string form of the metrics dict, suitable for a text output
        component. Currently it only holds the 'trf' entry produced by
        `get_metrics_trf`.

    Raises:
        TypeError: If `uploaded_file` is neither bytes/text nor a file object.
        ValueError: If the payload is not valid JSON (`json.JSONDecodeError`).
    """
    # Dispatch on the payload type instead of relying on a bare `except`, which
    # also hid genuine errors and failed for file-like inputs.
    uploaded_data = _load_uploaded_json(uploaded_file)

    all_metrics = {}
    all_metrics['trf'] = get_metrics_trf(uploaded_data)

    # Persist the upload, then read back the stored sample file it feeds.
    store_sample_data(uploaded_data)
    with open('./data/sample_data.json', 'r') as f:
        sample_data = json.load(f)

    # NOTE(review): these lists are only needed by the disabled GPT metric
    # below; each find_orgs_gpt call is a chat completion request. Gold labels
    # are collected first so a malformed sample fails before any request.
    true_orgs = [sent['orgs'] for sent in sample_data]
    gpt_orgs = [find_orgs_gpt(sent['text']) for sent in sample_data]

    # sim_model = SimCSE('sentence-transformers/all-MiniLM-L6-v2')
    # all_metrics['gpt'] = calc_metrics(true_orgs, gpt_orgs, sim_model)

    # Return the computed metrics; previously nothing was returned, so the
    # text output stayed empty.
    return str(all_metrics)
# Earlier UI experiments, kept for reference:
# radio_btn = gr.Radio(choices=['GPT', 'iSemantics'], value='iSemantics', label='Available models', show_label=True)
# textbox = gr.Textbox(label="Enter your text", placeholder=str(all_metrics), lines=8)
# Upload widget configured with type='binary'.
upload_btn = gr.UploadButton(label='Upload a json file.', type='binary')

# Wire the upload to find_orgs and render its return value in a text output.
iface = gr.Interface(fn=find_orgs, inputs=upload_btn, outputs="text")
# Starts the app as a side effect of importing/running this module, with
# share=True requested.
iface.launch(share=True)