Spaces:
Runtime error
Runtime error
File size: 1,647 Bytes
494e7c4 ca594bc 494e7c4 ca594bc 78a9a5b 5f4a64e 78a9a5b 494e7c4 ca594bc 78a9a5b 7531e94 78a9a5b 494e7c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
from pdf2image import convert_from_path
import pandas as pd, numpy as np
#from PIL import Image
#import pickle
import gradio as gr
from transformers import pipeline
import pandas as pd
#from sentence_transformers import SentenceTransformer, util
import pytesseract
from transformers import pipeline
# Shared classifier used by the findpdftype* functions in this file. No task is
# given here, so the pipeline type comes from the "acrowth/preesme" model
# itself (presumably resolved by transformers from the Hub -- confirm against
# the model card).
pipe = pipeline(model="acrowth/preesme")
# Earlier ways of loading a model, kept here only as commented-out reference:
#model = SentenceTransformer('clip-ViT-B-32')
#with open('preesmefiletype.h5', 'rb') as file:
# pipe=pickle.load(file)
def findpdftype(file):
    """Classify a PDF from an image of its first page.

    Only page 1 is rendered; the image is written to disk and its path is
    handed to the module-level ``pipe``.

    Args:
        file: Uploaded-file object from the Gradio ``"file"`` input. Only its
            ``name`` attribute (the path of the upload on disk) is read.

    Returns:
        The ``'label'`` entry of the first prediction returned by ``pipe``.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import os
    import tempfile

    pages = convert_from_path(file.name, first_page=1, last_page=1)

    # Each call gets its own scratch directory. Writing to a fixed
    # 'test.jpg' in the working directory let concurrent requests overwrite
    # each other's image (and so return the wrong label), and the file was
    # never cleaned up.
    with tempfile.TemporaryDirectory() as workdir:
        image_path = os.path.join(workdir, 'test.jpg')
        pages[0].save(image_path)
        return pipe(image_path)[0]['label']
def findpdftypeold2(file):
    """Legacy classifier: OCR every page and vote on the predicted label.

    Each page of the PDF is run through pytesseract, the recognised words
    are joined with spaces and cut to the first 1000 characters, and the
    resulting texts are passed to ``pipe`` in a single call.

    Args:
        file: Uploaded-file object; only its ``name`` attribute is read.

    Returns:
        The label that occurs most often among the per-page predictions.
    """
    page_texts = []
    for page in convert_from_path(file.name):
        frame = pytesseract.image_to_data(page, output_type='data.frame')
        frame = frame.dropna().reset_index(drop=True)
        # Round the float-typed columns and store them as integers.
        numeric = frame.select_dtypes('float').columns
        frame[numeric] = frame[numeric].round(0).astype(int)
        # Whitespace-only cells become NaN so the filter below drops them.
        frame = frame.replace(r'^\s*$', np.nan, regex=True)
        tokens = [token for token in frame.text if str(token) != 'nan']
        page_texts.append(' '.join(tokens)[:1000])

    predictions = pipe(page_texts)
    label_counts = pd.DataFrame(predictions).label.value_counts()
    return label_counts.sort_values(ascending=False).index[0]
def findpdftypeold(file):
    """Legacy classifier: encode every page image and vote on the label.

    All pages of the PDF are rendered, encoded with ``model.encode`` and
    classified with ``pipe.predict``.

    NOTE(review): ``model`` has no visible definition in this file (its only
    assignment is commented out), so calling this presumably raises
    ``NameError`` -- confirm before reusing this function.

    Args:
        file: Uploaded-file object; only its ``name`` attribute is read.

    Returns:
        The label that occurs most often among the per-page predictions.
    """
    page_images = convert_from_path(file.name)
    page_vectors = model.encode(page_images)
    page_labels = pipe.predict(page_vectors)
    votes = pd.DataFrame(page_labels, columns=['label']).label.value_counts()
    return votes.sort_values(ascending=False).index[0]
# Build and start the web UI: one uploaded file in, the predicted label out.
# (The stray trailing "|" after launch() was a dangling operator, i.e. a
# syntax error, and has been removed.)
gr.Interface(fn=findpdftype, inputs="file", outputs="label").launch()