File size: 1,647 Bytes
494e7c4
 
ca594bc
 
494e7c4
 
 
ca594bc
78a9a5b
 
5f4a64e
78a9a5b
 
 
 
494e7c4
 
ca594bc
 
 
 
 
78a9a5b
 
 
 
 
 
 
 
 
7531e94
78a9a5b
 
 
 
 
494e7c4
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pdf2image import convert_from_path
import pandas as pd, numpy as np
#from PIL import Image
#import pickle
import gradio as gr
from transformers import pipeline
import pandas as pd
#from sentence_transformers import SentenceTransformer, util
import pytesseract
from transformers import pipeline
# Load the Hugging Face pipeline for the "acrowth/preesme" checkpoint once, at
# import time; the classifier functions in this file all share this `pipe`.
# No task is passed, so the task is left for transformers to determine from
# the model itself.
pipe = pipeline(model="acrowth/preesme")

#model = SentenceTransformer('clip-ViT-B-32')
#with open('preesmefiletype.h5', 'rb') as file:
#    pipe=pickle.load(file)

def findpdftype(file):
    """Classify a PDF from a rendering of its first page.

    The first page is rendered to an image, saved as a JPEG, and the path of
    that JPEG is handed to the module-level ``pipe``.

    Args:
        file: The uploaded PDF. Either an object exposing the on-disk path as
            ``.name`` or a plain path string.

    Returns:
        The ``label`` field of the first prediction returned by ``pipe``.

    Raises:
        ValueError: If no page could be rendered from the PDF.
    """
    import os
    import tempfile

    # Accept both an upload wrapper with a `.name` path and a bare path.
    pdf_path = getattr(file, "name", file)

    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not pages:
        raise ValueError(f"No pages could be rendered from {pdf_path!r}")

    # A per-call temporary directory replaces the shared 'test.jpg' in the
    # working directory: concurrent requests can no longer overwrite each
    # other's image, and the file is removed once the prediction is done.
    with tempfile.TemporaryDirectory() as workdir:
        image_path = os.path.join(workdir, "test.jpg")
        pages[0].save(image_path)
        return pipe(image_path)[0]["label"]

def findpdftypeold2(file):
    """Legacy OCR-based classifier: label each page's text, return the mode.

    Every page of the PDF is rendered and OCR'd with pytesseract. The
    recognised words of a page are joined with spaces and cut to the first
    1000 characters. All page texts are then passed to ``pipe`` in one call,
    and the label that occurs most often among the predictions is returned.

    Args:
        file: Uploaded PDF object; its path is read from ``file.name``.

    Returns:
        The most frequent ``label`` across the per-page predictions.
    """
    page_texts = []
    for page in convert_from_path(file.name):
        frame = pytesseract.image_to_data(page, output_type='data.frame')
        frame = frame.dropna().reset_index(drop=True)

        # Round the float-typed columns and store them as integers.
        numeric_cols = frame.select_dtypes('float').columns
        frame[numeric_cols] = frame[numeric_cols].round(0).astype(int)

        # Empty / whitespace-only cells become NaN so the filter below
        # (which compares the string form against 'nan') drops them.
        frame = frame.replace(r'^\s*$', np.nan, regex=True)
        kept_words = (token for token in frame.text if str(token) != 'nan')
        page_texts.append(' '.join(kept_words)[:1000])

    predictions = pipe(page_texts)
    label_counts = pd.DataFrame(predictions).label.value_counts()
    return label_counts.sort_values(ascending=False).index[0]
    
def findpdftypeold(file):
    """Legacy embedding-based classifier: encode pages, return the mode label.

    All pages of the PDF are rendered to images, encoded with ``model``,
    and the encodings are passed to ``pipe.predict``. The prediction that
    occurs most often is returned.

    NOTE(review): no active statement in the visible source assigns
    ``model`` (the only candidate is a commented-out SentenceTransformer
    line), so calling this function appears to raise ``NameError`` --
    confirm before reusing it.

    Args:
        file: Uploaded PDF object; its path is read from ``file.name``.

    Returns:
        The most frequent predicted label across the pages.
    """
    page_images = convert_from_path(file.name)
    page_vectors = model.encode(page_images)
    page_labels = pipe.predict(page_vectors)

    label_frame = pd.DataFrame(page_labels, columns=['label'])
    ranked_counts = label_frame.label.value_counts().sort_values(ascending=False)
    return ranked_counts.index[0]

# Build the Gradio UI (file upload in, predicted label out) around
# `findpdftype`, then start serving it.
demo = gr.Interface(fn=findpdftype, inputs="file", outputs="label")
demo.launch()