Spaces:

acrowth
/

legalpdfclass

Runtime error

acrowth committed on Dec 12, 2022

Commit

78a9a5b

1 Parent(s): 6de47ae

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,12 +7,29 @@ from transformers import pipeline
 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import pickle
-model = SentenceTransformer('clip-ViT-B-32')
-with open('preesmefiletype.h5', 'rb') as file:
-    pipe=pickle.load(file)
 def findpdftype(file):
     images = convert_from_path(file.name)
     encodings=model.encode(images)
     predictions=pipe.predict(encodings)

 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import pickle
+import pytesseract
+from transformers import pipeline
+pipe = pipeline("text-classification",model="acrowth/autotrain-preesmetextclassifier-2437575785")
+#model = SentenceTransformer('clip-ViT-B-32')
+#with open('preesmefiletype.h5', 'rb') as file:
+#    pipe=pickle.load(file)
 def findpdftype(file):
+    images = convert_from_path(file.name)
+    results=[]
+    for image in images:
+        ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
+        ocr_df = ocr_df.dropna().reset_index(drop=True)
+        float_cols = ocr_df.select_dtypes('float').columns
+        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
+        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
+        words = ' '.join([word for word in ocr_df.text if str(word) != 'nan'])
+        results.append(words)
+    results=pipe(results)
+    return pd.DataFrame(results).label.value_counts().sort_values(ascending=False).index[0]
+def findpdftypeold(file):
     images = convert_from_path(file.name)
     encodings=model.encode(images)
     predictions=pipe.predict(encodings)