acrowth committed on
Commit
78a9a5b
·
1 Parent(s): 6de47ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -4
app.py CHANGED
@@ -7,12 +7,29 @@ from transformers import pipeline
7
  import pandas as pd
8
  from sentence_transformers import SentenceTransformer, util
9
  import pickle
 
 
 
 
 
 
 
10
 
11
- model = SentenceTransformer('clip-ViT-B-32')
12
- with open('preesmefiletype.h5', 'rb') as file:
13
- pipe=pickle.load(file)
14
-
15
  def findpdftype(file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  images = convert_from_path(file.name)
17
  encodings=model.encode(images)
18
  predictions=pipe.predict(encodings)
 
7
  import pandas as pd
8
  from sentence_transformers import SentenceTransformer, util
9
  import pickle
10
+ import pytesseract
11
+ from transformers import pipeline
12
+ pipe = pipeline("text-classification",model="acrowth/autotrain-preesmetextclassifier-2437575785")
13
+
14
+ #model = SentenceTransformer('clip-ViT-B-32')
15
+ #with open('preesmefiletype.h5', 'rb') as file:
16
+ # pipe=pickle.load(file)
17
 
 
 
 
 
18
  def findpdftype(file):
19
+ images = convert_from_path(file.name)
20
+ results=[]
21
+ for image in images:
22
+ ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
23
+ ocr_df = ocr_df.dropna().reset_index(drop=True)
24
+ float_cols = ocr_df.select_dtypes('float').columns
25
+ ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
26
+ ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
27
+ words = ' '.join([word for word in ocr_df.text if str(word) != 'nan'])
28
+ results.append(words)
29
+ results=pipe(results)
30
+ return pd.DataFrame(results).label.value_counts().sort_values(ascending=False).index[0]
31
+
32
+ def findpdftypeold(file):
33
  images = convert_from_path(file.name)
34
  encodings=model.encode(images)
35
  predictions=pipe.predict(encodings)