Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -7,12 +7,29 @@ from transformers import pipeline
|
|
7 |
import pandas as pd
|
8 |
from sentence_transformers import SentenceTransformer, util
|
9 |
import pickle
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
model = SentenceTransformer('clip-ViT-B-32')
|
12 |
-
with open('preesmefiletype.h5', 'rb') as file:
|
13 |
-
pipe=pickle.load(file)
|
14 |
-
|
15 |
def findpdftype(file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
images = convert_from_path(file.name)
|
17 |
encodings=model.encode(images)
|
18 |
predictions=pipe.predict(encodings)
|
|
|
7 |
import pandas as pd
|
8 |
from sentence_transformers import SentenceTransformer, util
|
9 |
import pickle
|
10 |
+
import pytesseract
|
11 |
+
from transformers import pipeline
|
12 |
+
# Module-level Hugging Face text-classification pipeline; findpdftype calls it
# on the OCR text of each page to obtain a label per page.
# NOTE(review): constructed at import time — presumably this fetches the model
# from the Hub on first start; confirm the Space has network access/cache.
pipe = pipeline("text-classification",model="acrowth/autotrain-preesmetextclassifier-2437575785")
|
13 |
+
|
14 |
+
#model = SentenceTransformer('clip-ViT-B-32')
|
15 |
+
#with open('preesmefiletype.h5', 'rb') as file:
|
16 |
+
# pipe=pickle.load(file)
|
17 |
|
|
|
|
|
|
|
|
|
18 |
def _ocr_page_text(image):
    """Return the words OCR'd from one page image, joined by single spaces.

    Runs ``pytesseract.image_to_data`` on ``image`` and keeps only the
    ``text`` column of rows that have no missing values. Whitespace-only
    entries are skipped so they do not add stray separators.
    """
    ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
    # Rows with a missing value in any column are discarded, as before.
    ocr_df = ocr_df.dropna()
    # str() guards against a `text` column that pandas parsed as numeric
    # (e.g. a page containing only numbers); str.join() rejects non-strings.
    words = (str(word) for word in ocr_df.text)
    return ' '.join(word for word in words if word.strip())


def findpdftype(file):
    """Classify a PDF by majority vote over the predicted label of each page.

    Every page is rendered to an image, OCR'd to plain text, and the text is
    passed to the module-level ``pipe`` text classifier. The label predicted
    for the largest number of pages is returned.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF on disk
            (presumably an uploaded temp file — verify against the caller).

    Returns:
        The most frequent page label, as produced by ``pipe``.

    Raises:
        ValueError: If the PDF yields no pages, so no vote can be taken.
    """
    images = convert_from_path(file.name)
    if not images:
        # Without this guard the failure surfaces later as an opaque
        # AttributeError on an empty DataFrame.
        raise ValueError(f"No pages could be rendered from {file.name!r}")

    page_texts = [_ocr_page_text(image) for image in images]

    # truncation=True is forwarded to the tokenizer so that a page with more
    # tokens than the model accepts is clipped instead of failing.
    # NOTE(review): relies on the installed transformers version forwarding
    # tokenizer kwargs from the text-classification pipeline — confirm.
    predictions = pipe(page_texts, truncation=True)

    labels = [prediction['label'] for prediction in predictions]
    # value_counts() is already sorted by descending count; idxmax() picks
    # the label with the highest count.
    return pd.Series(labels).value_counts().idxmax()
|
31 |
+
|
32 |
+
def findpdftypeold(file):
|
33 |
images = convert_from_path(file.name)
|
34 |
encodings=model.encode(images)
|
35 |
predictions=pipe.predict(encodings)
|