File size: 4,628 Bytes
953a835 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3
import Prove_lite as prv
import pandas as pd
import numpy as np
import os
def wtr_process(qid):
try:
conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
target_QID = qid
query = f"SELECT * FROM {'claim_text'}"
df = pd.read_sql_query(query, conn)
if target_QID in df['entity_id'].unique():
pass
else:
wtr.claimParser(target_QID) #save results in .db
filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
url_set = wtr.urlParser(target_QID) #from ref table in .db
html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
claim_text = wtr.claim2text(html_set) #Claims generation
html_text = wtr.html2text(html_set)
claim_text = claim_text.astype(str)
html_text = html_text.astype(str)
claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
html_text.to_sql('html_text', conn, if_exists='replace', index=False)
conn.commit()
query = f"""
SELECT
claim_text.entity_label,
claim_text.property_label,
claim_text.object_label,
html_text.url
FROM claim_text
INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
WHERE claim_text.entity_id = '{target_QID}'
"""
result_df = pd.read_sql_query(query, conn)
conn.commit()
conn.close()
return result_df
except Exception as e:
error_df = pd.DataFrame({'Error': [str(e)]})
return error_df
def prv_process(qid):
target_QID = qid
conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
claim_df = pd.read_sql_query(query, conn)
query = f"SELECT * FROM html_text Where entity_id = '{target_QID}'"
reference_text_df = pd.read_sql_query(query, conn)
verbalised_claims_df_final = prv.verbalisation(claim_df)
progress = gr.Progress(len(verbalised_claims_df_final)) # Create progress bar for Gradio
def update_progress(curr_step, total_steps):
progress((curr_step + 1) / total_steps)
splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)
BATCH_SIZE = 512
N_TOP_SENTENCES = 5
SCORE_THRESHOLD = 0
evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
display_df = prv.TableMaking(verbalised_claims_df_final, result)
conn.commit()
conn.close()
return display_df
with gr.Blocks() as demo:
print("gradio started!")
gr.Markdown(
"""
# Prove
This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
"""
)
inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q245247)")
out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
run_button_1 = gr.Button("Start parsing")
run_button_1.click(wtr_process, inp, out)
gr.Markdown(
"""
Pre-trained language models-based text entailment.
"""
)
out_2 = gr.HTML(label="Results")
run_button_2 = gr.Button("Start processing")
run_button_2.click(prv_process, inp, out_2)
if __name__ == "__main__":
#DB initialising
if os.path.isfile('wikidata_claims_refs_parsed.db') != True:
conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
target_QID = 'Q115305900'
wtr.claimParser(target_QID) #save results in .db
filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
url_set = wtr.urlParser(target_QID) #from ref table in .db
html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
claim_text = wtr.claim2text(html_set) #Claims generation
html_text = wtr.html2text(html_set)
claim_text = claim_text.astype(str)
html_text = html_text.astype(str)
claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
html_text.to_sql('html_text', conn, if_exists='replace', index=False)
conn.commit()
conn.close()
demo.launch(share=True) |