# legalpdfclass / app.py
# acrowth's picture
# Update app.py
# 5f4a64e
from pdf2image import convert_from_path
import pandas as pd, numpy as np
#from PIL import Image
#import pickle
import gradio as gr
from transformers import pipeline
import pandas as pd
#from sentence_transformers import SentenceTransformer, util
import pytesseract
from transformers import pipeline
# Build the Hugging Face pipeline for the "acrowth/preesme" model once, at
# import time. The classifier functions in this file all call this shared
# `pipe` object. No task is passed explicitly here.
pipe = pipeline(model="acrowth/preesme")
#model = SentenceTransformer('clip-ViT-B-32')
#with open('preesmefiletype.h5', 'rb') as file:
# pipe=pickle.load(file)
def findpdftype(file):
    """Classify a PDF by running the model on an image of its first page.

    Only page 1 is rendered. It is saved as a JPEG and the saved file's path
    is handed to ``pipe``.

    Args:
        file: The uploaded PDF. Either an object exposing the file path on
            ``.name`` (the form the original code required) or a plain path
            (``str`` / ``os.PathLike``).

    Returns:
        The ``label`` field of the first prediction returned by ``pipe``.

    Raises:
        ValueError: If no page could be rendered from the PDF.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    # Check for real paths first: pathlib.Path also has a `.name`, but there
    # it is only the final path component, not the full path.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = file
    else:
        pdf_path = file.name

    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not pages:
        raise ValueError(f"no pages could be rendered from {pdf_path!r}")

    # A per-call temporary directory replaces the fixed 'test.jpg' in the
    # working directory: simultaneous requests can no longer overwrite each
    # other's page image, and the file is removed once the prediction is done.
    with tempfile.TemporaryDirectory() as workdir:
        image_path = os.path.join(workdir, "test.jpg")
        pages[0].save(image_path)
        return pipe(image_path)[0]["label"]
def findpdftypeold2(file):
    """Older variant: classify a PDF from the OCR text of each of its pages.

    Every page is rendered and run through Tesseract. The recognised words of
    a page are joined with spaces and cut to the first 1000 characters. All
    page texts are passed to ``pipe`` in one call, and the label that occurs
    most often among the predictions is returned.

    Args:
        file: Uploaded PDF; its path is read from ``file.name``.

    Returns:
        The most frequent ``label`` across the per-page predictions.
    """
    page_texts = []
    for page in convert_from_path(file.name):
        frame = pytesseract.image_to_data(page, output_type='data.frame')
        # Drop rows containing any missing value and renumber the rest.
        frame = frame.dropna().reset_index(drop=True)
        # Float-typed columns are rounded and stored as integers.
        numeric_cols = frame.select_dtypes('float').columns
        frame[numeric_cols] = frame[numeric_cols].round(0).astype(int)
        # Blank / whitespace-only cells become NaN so they can be skipped.
        frame = frame.replace(r'^\s*$', np.nan, regex=True)
        tokens = [token for token in frame.text if str(token) != 'nan']
        page_texts.append(' '.join(tokens)[:1000])

    predictions = pipe(page_texts)
    label_counts = pd.DataFrame(predictions).label.value_counts()
    return label_counts.sort_values(ascending=False).index[0]
def findpdftypeold(file):
    """Oldest variant: classify a PDF from encodings of its page images.

    All pages are rendered and encoded with ``model``. ``pipe.predict`` is
    called on the encodings, and the label that occurs most often among the
    predictions is returned.

    NOTE(review): ``model`` is not assigned anywhere in the visible file (the
    line that would create it is commented out), so as shown this function
    raises ``NameError`` when called. Confirm before reusing it.

    Args:
        file: Uploaded PDF; its path is read from ``file.name``.

    Returns:
        The most frequent predicted label.
    """
    pages = convert_from_path(file.name)
    page_vectors = model.encode(pages)
    labels = pipe.predict(page_vectors)
    label_counts = pd.DataFrame(labels, columns=['label']).label.value_counts()
    return label_counts.sort_values(ascending=False).index[0]
# Wire the classifier into a Gradio UI (file upload in, label out) and start
# serving it as soon as this module is executed.
_interface = gr.Interface(fn=findpdftype, inputs="file", outputs="label")
_interface.launch()