not-lain committed on
Commit
320f164
·
1 Parent(s): a023810

error handling

Browse files
Files changed (1) hide show
  1. app.py +20 -11
app.py CHANGED
@@ -32,7 +32,7 @@ def process_pdfs(parent_dir: Union[str,list]):
32
  parent_dir = [parent_dir]
33
  for file_path in parent_dir:
34
  if ".pdf" not in file_path : # skip non pdf files
35
- continue
36
  # creating a pdf file object
37
  pdfFileObj = open(file_path, 'rb')
38
 
@@ -48,8 +48,8 @@ def process_pdfs(parent_dir: Union[str,list]):
48
  txt = txt.replace("\t","") # strip tabs
49
  txt = re.sub(r" +"," ",txt) # strip extra space
50
  # 512 is related to the positional encoding "facebook/dpr-ctx_encoder-single-nq-base" model
 
51
  if len(txt) < 512 :
52
- file_name = file_path.split("/")[-1]
53
  new_data = {"title":f"{file_name}-page-{i}","text":txt}
54
  df = df.append(new_data,ignore_index=True)
55
  else :
@@ -70,6 +70,8 @@ def process(example):
70
 
71
  def process_dataset(df):
72
  """processess the dataframe and returns a dataset variable"""
 
 
73
  ds = Dataset.from_pandas(df)
74
  ds = ds.map(process)
75
  ds.add_faiss_index(column='embeddings') # add faiss index
@@ -77,19 +79,26 @@ def process_dataset(df):
77
 
78
  def search(query, ds, k=3):
79
  """searches the query in the dataset and returns the k most similar"""
80
- tokens = q_tokenizer(query, return_tensors="pt")
81
- query_embed = q_encoder(**tokens)[0][0].numpy()
82
- scores, retrieved_examples = ds.get_nearest_examples("embeddings", query_embed, k=k)
83
- out = f"""title : {retrieved_examples["title"][0]},\ncontent: {retrieved_examples["text"][0]}
84
- similar resources: {retrieved_examples["title"]}
85
- """
 
 
 
86
  return out
87
 
88
  def predict(query,file_paths, k=3):
89
  """predicts the most similar files to the query"""
90
- df = process_pdfs(file_paths)
91
- ds = process_dataset(df)
92
- return search(query,ds,k=k)
 
 
 
 
93
 
94
  with gr.Blocks() as demo :
95
  with gr.Column():
 
32
  parent_dir = [parent_dir]
33
  for file_path in parent_dir:
34
  if ".pdf" not in file_path : # skip non pdf files
35
+ raise Exception("only pdf files are supported")
36
  # creating a pdf file object
37
  pdfFileObj = open(file_path, 'rb')
38
 
 
48
  txt = txt.replace("\t","") # strip tabs
49
  txt = re.sub(r" +"," ",txt) # strip extra space
50
  # 512 is related to the positional encoding "facebook/dpr-ctx_encoder-single-nq-base" model
51
+ file_name = file_path.split("/")[-1]
52
  if len(txt) < 512 :
 
53
  new_data = {"title":f"{file_name}-page-{i}","text":txt}
54
  df = df.append(new_data,ignore_index=True)
55
  else :
 
70
 
71
  def process_dataset(df):
72
  """processess the dataframe and returns a dataset variable"""
73
+ if len(df) == 0 :
74
+ raise Exception("empty pdf files, or can't read text from them")
75
  ds = Dataset.from_pandas(df)
76
  ds = ds.map(process)
77
  ds.add_faiss_index(column='embeddings') # add faiss index
 
79
 
80
  def search(query, ds, k=3):
81
  """searches the query in the dataset and returns the k most similar"""
82
+ try :
83
+ tokens = q_tokenizer(query, return_tensors="pt")
84
+ query_embed = q_encoder(**tokens)[0][0].numpy()
85
+ scores, retrieved_examples = ds.get_nearest_examples("embeddings", query_embed, k=k)
86
+ out = f"""title : {retrieved_examples["title"][0]},\ncontent: {retrieved_examples["text"][0]}
87
+ similar resources: {retrieved_examples["title"]}
88
+ """
89
+ except Exception as e:
90
+ out = f"error: {e}"
91
  return out
92
 
93
  def predict(query,file_paths, k=3):
94
  """predicts the most similar files to the query"""
95
+ try :
96
+ df = process_pdfs(file_paths)
97
+ ds = process_dataset(df)
98
+ out = search(query,ds,k=k)
99
+ except Exception as e:
100
+ out = f"error: {e}"
101
+ return out
102
 
103
  with gr.Blocks() as demo :
104
  with gr.Column():