Spaces:
Runtime error
Runtime error
from pdf2image import convert_from_path | |
import pandas as pd, numpy as np | |
#from PIL import Image | |
#import pickle | |
import gradio as gr | |
from transformers import pipeline | |
import pandas as pd | |
#from sentence_transformers import SentenceTransformer, util | |
import pytesseract | |
from transformers import pipeline | |
# Hub identifier of the classifier checkpoint used by the functions below.
MODEL_ID = "acrowth/preesme"

# Built once at import time so every request reuses the same loaded pipeline.
pipe = pipeline(model=MODEL_ID)
#model = SentenceTransformer('clip-ViT-B-32') | |
#with open('preesmefiletype.h5', 'rb') as file: | |
# pipe=pickle.load(file) | |
def findpdftype(file):
    """Classify a PDF from a rendering of its first page.

    Only page 1 is converted to an image. The image is written to disk as a
    JPEG and its path is passed to the module-level ``pipe``.

    Args:
        file: The uploaded PDF. Either a path (``str`` / ``os.PathLike``) or
            an object that exposes the path through a ``.name`` attribute.

    Returns:
        The ``'label'`` entry of the first prediction returned by ``pipe``.
    """
    import os
    import tempfile

    # Accept both upload styles: a plain path, or a file wrapper with `.name`.
    if isinstance(file, (str, os.PathLike)):
        pdf_path = file
    else:
        pdf_path = file.name

    pages = convert_from_path(pdf_path, first_page=1, last_page=1)

    # A private directory per call: concurrent requests must not overwrite
    # each other's image, and the file is removed once the label is known.
    with tempfile.TemporaryDirectory() as workdir:
        image_path = os.path.join(workdir, 'test.jpg')
        pages[0].save(image_path)
        return pipe(image_path)[0]['label']
def findpdftypeold2(file):
    """Classify a PDF from the OCR text of every page (legacy variant).

    Each page is OCR'd with pytesseract, the recognised words are joined into
    one string and cut to its first 1000 characters. All page strings are
    passed to the module-level ``pipe`` in one call, and the label predicted
    for the most pages is returned.

    Args:
        file: Uploaded PDF; its path is read from ``file.name``.

    Returns:
        The label that occurs most often among the per-page predictions.
    """
    page_texts = []
    for page in convert_from_path(file.name):
        frame = pytesseract.image_to_data(page, output_type='data.frame')
        # Drop rows with any missing field, then renumber the rows.
        frame = frame.dropna().reset_index(drop=True)
        numeric = frame.select_dtypes('float').columns
        frame[numeric] = frame[numeric].round(0).astype(int)
        # Whitespace-only cells become NaN so the filter below skips them.
        frame = frame.replace(r'^\s*$', np.nan, regex=True)
        tokens = [tok for tok in frame.text if str(tok) != 'nan']
        page_texts.append(' '.join(tokens)[:1000])

    predictions = pipe(page_texts)
    label_counts = pd.DataFrame(predictions).label.value_counts()
    return label_counts.sort_values(ascending=False).index[0]
def findpdftypeold(file):
    """Classify a PDF from encodings of its page images (legacy variant).

    Every page is rendered to an image, the images are encoded with the
    module-level ``model``, and ``pipe.predict`` is run on the encodings.
    The label predicted most often is returned.

    NOTE(review): the only assignment of ``model`` in the visible source is
    commented out, so calling this raises ``NameError`` unless ``model`` is
    defined elsewhere - confirm before reviving this function.

    Args:
        file: Uploaded PDF; its path is read from ``file.name``.

    Returns:
        The label that occurs most often among the predictions.
    """
    pages = convert_from_path(file.name)
    embeddings = model.encode(pages)
    labels = pipe.predict(embeddings)
    tally = pd.DataFrame(labels, columns=['label']).label.value_counts()
    return tally.sort_values(ascending=False).index[0]
# Web UI: a file upload is passed to findpdftype and the result is shown as a label.
demo = gr.Interface(fn=findpdftype, inputs="file", outputs="label")
demo.launch()