|
import os |
|
import subprocess
import sys

# Install the OCR runtime at start-up. Going through ``sys.executable -m pip``
# puts the packages into the interpreter that is running this script (a bare
# ``pip`` found on PATH may belong to a different Python), and passing an
# argument list avoids spawning a shell. ``check=False`` keeps the install
# best-effort: if it fails, the ``paddleocr`` import that follows will raise.
for _package in ('paddlepaddle', 'paddleocr'):
    subprocess.run([sys.executable, '-m', 'pip', 'install', _package], check=False)
|
from paddleocr import PaddleOCR, draw_ocr |
|
from PIL import Image |
|
import gradio as gr |
|
import torch |
|
|
|
# Fetch the sample picture that the demo lists as its example input.
torch.hub.download_url_to_file(
    url='https://i.imgur.com/aqMBT0i.jpg',
    dst='example.jpg',
)
|
|
|
def _looks_like_detection(item):
    """Return True when *item* has the ``[box, (text, score)]`` shape."""
    return (
        isinstance(item, (list, tuple))
        and len(item) == 2
        and isinstance(item[1], (list, tuple))
        and bool(item[1])
        and isinstance(item[1][0], str)
    )


def _flatten_ocr_result(result):
    """Return the detections contained in *result* as one flat list.

    Handles both a flat list of ``[box, (text, score)]`` detections and a
    nested list holding one list of detections per input image (where a page
    without text may be ``None``). An empty or ``None`` result yields ``[]``.
    """
    if not result:
        return []
    if _looks_like_detection(result[0]):
        return list(result)
    detections = []
    for page in result:
        if page:  # skip pages on which nothing was recognised
            detections.extend(page)
    return detections


def inference(img, lang):
    """Run OCR on an uploaded image and save an annotated copy.

    Args:
        img: Uploaded file object; only its ``name`` attribute (the path of
            the image on disk) is used.
        lang: Language code passed to ``PaddleOCR``.

    Returns:
        The path ``'result.jpg'``, where the output image was written.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
    img_path = img.name
    result = ocr.ocr(img_path, cls=True)

    # NOTE(review): the nesting of the return value differs between PaddleOCR
    # releases (flat list vs. one list per image); normalise before indexing.
    detections = _flatten_ocr_result(result)

    # ``convert`` returns an independent image, so the file handle can be
    # released right away instead of lingering until garbage collection.
    with Image.open(img_path) as source:
        image = source.convert('RGB')

    if detections:
        boxes = [line[0] for line in detections]
        txts = [line[1][0] for line in detections]
        annotated = Image.fromarray(
            draw_ocr(image, boxes, txts, font_path='simfang.ttf')
        )
    else:
        # Nothing was recognised: return the picture unchanged rather than
        # failing while indexing into an empty / None result.
        annotated = image

    annotated.save('result.jpg')
    return 'result.jpg'
|
|
|
# Static texts shown on the OCR demo page.
title = 'A Framework for Data-Driven Document Evaluation and scoring - Image to Text Extraction '
description = 'Demo for Optical character recognition(OCR)'
article = ""

# One example row: [image path, language code].
examples = [['example.jpg','en']]

# Force both image panes to a fixed height and full width.
css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"

# Input widgets, in the order of the parameters of ``inference``.
ocr_inputs = [
    gr.inputs.Image(type='file', label='Input'),
    gr.inputs.Dropdown(
        choices=['ch', 'en', 'fr', 'german', 'korean', 'japan'],
        type="value",
        default='en',
        label='language',
    ),
]
ocr_output = gr.outputs.Image(type='file', label='Output')

ocr_demo = gr.Interface(
    inference,
    ocr_inputs,
    ocr_output,
    title=title,
    description=description,
    article=article,
    examples=examples,
    css=css,
    enable_queue=True,
)
ocr_demo.launch(debug=True)
|
|
|
|
|
|
|
|
|
import os |
|
import gradio as gr |
|
from huggingface_hub import snapshot_download |
|
from prettytable import PrettyTable |
|
import pandas as pd |
|
import torch |
|
import traceback |
|
|
|
# Settings handed to ``Loren`` below; the three ``*_dir`` entries are added
# once the model snapshot has been downloaded.
config = dict(
    model_type="roberta",
    model_name_or_path="roberta-large",
    logic_lambda=0.5,
    prior="random",
    mask_rate=0.0,
    cand_k=1,
    max_seq1_length=256,
    max_seq2_length=128,
    max_num_questions=8,
    do_lower_case=False,
    seed=42,
    n_gpu=torch.cuda.device_count(),
)

# Clone the project, drop its data/, results/ and models/ folders, then move
# what is left into the working directory. The commands run in this order.
for _shell_command in (
    'git clone https://github.com/kkpathak91/project_metch/',
    'rm -r project_metch/data/',
    'rm -r project_metch/results/',
    'rm -r project_metch/models/',
    'mv project_metch/* ./',
):
    os.system(_shell_command)

# Download the model snapshot and record where each component lives in it.
model_dir = snapshot_download('kkpathak91/FVM')
config.update(
    fc_dir=os.path.join(model_dir, 'fact_checking/roberta-large/'),
    mrc_dir=os.path.join(model_dir, 'mrc_seq2seq/bart-base/'),
    er_dir=os.path.join(model_dir, 'evidence_retrieval/'),
)

# Imported this late on purpose: presumably ``src`` only exists after the
# repository contents were moved into place above (verify against the repo).
from src.loren import Loren

loren = Loren(config, verbose=False)
|
# Smoke-test the pipeline on a sample claim so that a broken setup aborts
# start-up instead of failing on the first user request.
try:
    js = loren.check('Donald Trump won the 2020 U.S. presidential election.')
except Exception as e:
    # Chain explicitly so the traceback reports the original failure as the
    # direct cause of the ValueError.
    raise ValueError(e) from e
|
|
|
|
|
def highlight_phrase(text, phrase):
    """Fill every ``<mask>`` in *text* with *phrase* in bold italics.

    The text is first passed through the fact-checking tokenizer's
    ``clean_up_tokenization``; each ``<mask>`` is then replaced by
    ``<i><b>phrase</b></i>``.
    """
    cleaned = loren.fc_client.tokenizer.clean_up_tokenization(text)
    emphasized = f'<i><b>{phrase}</b></i>'
    return cleaned.replace('<mask>', emphasized)
|
|
|
|
|
def highlight_entity(text, entity):
    """Wrap every occurrence of *entity* in *text* in bold-italic markup.

    Args:
        text: The string to search.
        entity: The substring to emphasise.

    Returns:
        *text* with each occurrence of *entity* replaced by
        ``<i><b>entity</b></i>``. *text* is returned unchanged when *entity*
        is empty or ``None``.
    """
    # ``str.replace('', x)`` would insert the markup between every character,
    # so an empty entity must leave the text alone.
    if not entity:
        return text
    return text.replace(entity, f'<i><b>{entity}</b></i>')
|
|
|
|
|
def gradio_formatter(js, output_type):
    """Render one view of a verification result as an HTML table.

    Args:
        js: Result mapping. For ``'e'`` the keys ``'evidence'`` and
            ``'entities'`` are read; for ``'z'`` the keys
            ``'phrase_veracity'``, ``'claim_phrases'``, ``'cloze_qs'`` and
            ``'evidential'``.
        output_type: ``'e'`` for the evidence table, ``'z'`` for the
            per-phrase table.

    Returns:
        An HTML string: a ``<head>`` carrying zebra-striping CSS, followed by
        the table.

    Raises:
        NotImplementedError: If *output_type* is neither ``'e'`` nor ``'z'``.
    """
    zebra_css = '''
    tr:nth-child(even) {
        background: #f1f1f1;
    }
    thead{
        background: #f1f1f1;
    }'''

    if output_type == 'e':
        data = {
            'Evidence': [
                highlight_entity(x, e)
                for x, e in zip(js['evidence'], js['entities'])
            ],
        }
    elif output_type == 'z':
        p_sup, p_ref, p_nei = [], [], []
        for probs in js['phrase_veracity']:
            best = torch.argmax(torch.tensor(probs)).tolist()
            cells = ['%.4f' % p for p in probs]
            # Emphasise the highest score of this phrase.
            cells[best] = f'<i><b>{cells[best]}</b></i>'
            # Index 2 feeds p_SUP, index 0 p_REF and index 1 p_NEI.
            p_sup.append(cells[2])
            p_ref.append(cells[0])
            p_nei.append(cells[1])

        data = {
            'Claim Phrase': js['claim_phrases'],
            'Local Premise': [
                highlight_phrase(q, x[0])
                for q, x in zip(js['cloze_qs'], js['evidential'])
            ],
            'p_SUP': p_sup,
            'p_REF': p_ref,
            'p_NEI': p_nei,
        }
    else:
        raise NotImplementedError

    data = pd.DataFrame(data)
    pt = PrettyTable(field_names=list(data.columns),
                     align='l', border=True, hrules=1, vrules=1)
    for row in data.values:
        pt.add_row(row)

    html = pt.get_html_string(attributes={
        # ``border-color`` is the valid CSS property name.
        'style': 'border-width: 2px; border-color: black'
    }, format=True)
    html = f'<head> <style type="text/css"> {zebra_css} </style> </head>\n' + html
    # PrettyTable HTML-escapes cell text, which would show the <i><b> markup
    # literally; turn the escaped angle brackets back into real tags.
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    return html
|
|
|
|
|
def run(claim):
    """Verify *claim* and return the values for the three output widgets.

    Returns:
        A tuple ``(label, z_html, ev_html)``: the claim veracity label, the
        per-phrase HTML table and the evidence HTML table. If the check
        raises, the claim and the traceback are logged and
        ``('Oops, something went wrong.', '', '')`` is returned instead.
    """
    try:
        js = loren.check(claim)
    except Exception as error_msg:
        # Top-level boundary of the web handler: log everything and hand the
        # user a generic message rather than a stack trace.
        exc = traceback.format_exc()
        msg = f'[Error]: {error_msg}.\n[Traceback]: {exc}'
        loren.logger.error(claim)
        loren.logger.error(msg)
        return 'Oops, something went wrong.', '', ''
    else:
        verdict = js['claim_veracity']
        loren.logger.warning(verdict + str(js))
        evidence_table = gradio_formatter(js, 'e')
        phrase_table = gradio_formatter(js, 'z')
        return verdict, phrase_table, evidence_table
|
|
|
|
|
# Claims offered as clickable examples.
sample_claims = [
    'Kanpur is a city in Nepal',
    'PV Sindhu is an Indian Badminton Player.',
]

# Labels a user can attach when flagging a result.
flag_choices = [
    'Interesting!',
    'Error: Claim Phrase Parsing',
    'Error: Local Premise',
    'Error: Require Commonsense',
    'Error: Evidence Retrieval',
]

# One text input; the outputs match the tuple returned by ``run``:
# a text label followed by two HTML tables.
iface = gr.Interface(
    fn=run,
    inputs="text",
    outputs=['text', 'html', 'html'],
    examples=sample_claims,
    title="A Framework for Data-Driven Document Evaluation and Scoring",
    description="[Student Name: Karan Kumar Pathak] " " [Roll No.: 2020fc04334] ",
    layout='horizontal',
    allow_flagging=True,
    flagging_dir='results/flagged/',
    flagging_options=flag_choices,
    enable_queue=True,
)
iface.launch()