"""Gradio app that classifies an uploaded PDF with a Hugging Face pipeline.

The active entry point is ``findpdftype``, which classifies the first page of
the PDF as an image. ``findpdftypeold2`` and ``findpdftypeold`` are earlier
approaches kept for reference.
"""

import os
import tempfile

import gradio as gr
import numpy as np
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from transformers import pipeline

# Previously used dependencies, kept disabled for reference:
# from PIL import Image
# import pickle
# from sentence_transformers import SentenceTransformer, util

# Maximum number of OCR characters per page sent to the classifier.
MAX_OCR_CHARS = 1000

pipe = pipeline(model="acrowth/preesme")

# Legacy setup used by ``findpdftypeold`` (disabled):
# model = SentenceTransformer('clip-ViT-B-32')
# with open('preesmefiletype.h5', 'rb') as file:
#     pipe = pickle.load(file)


def _pdf_path(file):
    """Return the filesystem path for an uploaded file.

    Accepts either an object exposing a ``name`` attribute (as the original
    code assumed) or a plain path, so the handlers work with both shapes.

    Raises:
        ValueError: if ``file`` is ``None``.
    """
    if file is None:
        raise ValueError("No PDF file was provided.")
    return getattr(file, "name", file)


def _load_pages(file, **convert_kwargs):
    """Render the PDF to a non-empty list of page images.

    Extra keyword arguments are forwarded to ``convert_from_path``.

    Raises:
        ValueError: if no file was provided or no page could be rendered.
    """
    pages = convert_from_path(_pdf_path(file), **convert_kwargs)
    if not pages:
        raise ValueError("The PDF does not contain any renderable page.")
    return pages


def _most_frequent_label(labels):
    """Return the label that occurs most often in ``labels``.

    ``value_counts`` already orders by descending count, so the first index
    entry is the winner; ties follow pandas' ``value_counts`` ordering.

    Raises:
        ValueError: if ``labels`` is empty.
    """
    counts = pd.Series(list(labels), dtype=object).value_counts()
    if counts.empty:
        raise ValueError("No labels to choose from.")
    return counts.index[0]


def _page_text(image, max_chars=MAX_OCR_CHARS):
    """OCR one page image and return its words joined by spaces.

    The result is truncated to ``max_chars`` characters.
    """
    ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    float_cols = ocr_df.select_dtypes('float').columns
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    # Blank / whitespace-only cells become NaN so they can be skipped below.
    ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
    # ``str(word)`` guards against pandas having parsed the text column as
    # numeric; ``pd.notna`` skips blanks without dropping a literal "nan".
    words = ' '.join(str(word) for word in ocr_df["text"] if pd.notna(word))
    return words[:max_chars]


def findpdftype(file):
    """Classify a PDF from an image of its first page.

    Args:
        file: the uploaded file (object with ``name``, or a path).

    Returns:
        The ``label`` of the first prediction returned by ``pipe``.

    Raises:
        ValueError: if no file was provided or the PDF has no page.
    """
    pages = _load_pages(file, first_page=1, last_page=1)
    # Use a per-call temporary directory instead of a fixed 'test.jpg' in the
    # working directory, so concurrent requests cannot overwrite each other's
    # image and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        page_path = os.path.join(tmp_dir, "page.jpg")
        pages[0].save(page_path)
        return pipe(page_path)[0]['label']


def findpdftypeold2(file):
    """Legacy: classify a PDF from the OCR text of every page.

    Each page's text (truncated to ``MAX_OCR_CHARS``) is passed to ``pipe``
    and the most frequent predicted label wins.
    NOTE(review): this assumes ``pipe`` accepts a list of strings and returns
    one ``{'label': ...}`` dict per input -- confirm before re-enabling.

    Raises:
        ValueError: if no file was provided or the PDF has no page.
    """
    page_texts = [_page_text(image) for image in _load_pages(file)]
    predictions = pipe(page_texts)
    return _most_frequent_label(
        prediction['label'] for prediction in predictions
    )


def findpdftypeold(file):
    """Legacy: classify a PDF from per-page embeddings.

    Depends on a module-level ``model`` providing ``encode`` whose definition
    is currently commented out, and on ``pipe.predict`` accepting those
    encodings.

    Raises:
        RuntimeError: if the module-level ``model`` is not defined.
        ValueError: if no file was provided or the PDF has no page.
    """
    encoder = globals().get("model")
    if encoder is None:
        # Fail with an explicit message instead of a bare NameError.
        raise RuntimeError(
            "findpdftypeold requires the module-level 'model' encoder, "
            "which is currently disabled."
        )
    images = _load_pages(file)
    encodings = encoder.encode(images)
    predictions = pipe.predict(encodings)
    return _most_frequent_label(predictions)


# Launched at module level, exactly as before, so running the script starts
# the web UI.
gr.Interface(fn=findpdftype, inputs="file", outputs="label").launch()