kxx-kkk committed on
Commit
505522b
·
verified ·
1 Parent(s): 2b5f59d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -55
app.py CHANGED
@@ -30,63 +30,63 @@ def question_model():
30
  question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, handle_impossible_answer=True)
31
  return question_answerer
32
 
33
- # get the answer by passing the context & question to the model
34
def question_answering(context, question):
    """Ask the question-answering pipeline one question and render the result.

    The pipeline returned by ``question_model()`` is called once on the whole
    ``context``. The answer text and its ``score`` are written into a bordered
    Streamlit container; an empty answer string is shown as "CANNOT ANSWER".

    Args:
        context: Text the answer is extracted from.
        question: Question to ask about ``context``.
    """
    with st.spinner(text="Loading question model..."):
        qa_pipeline = question_model()

    with st.spinner(text="Getting answer..."):
        result = qa_pipeline(context=context, question=question)
        print(result)

        score_text = str(result["score"])
        answer_text = result["answer"]
        # The pipeline is built with handle_impossible_answer=True, so the
        # answer may come back as an empty string; show a placeholder instead.
        if answer_text == "":
            answer_text = "CANNOT ANSWER"

        # Render the answer and its score inside a bordered container.
        markup = "".join([
            "<h5><b>Answer:</b></h5>",
            answer_text,
            "<p><small>(F1 score: ",
            score_text,
            ")</small></p><br>",
        ])
        result_box = st.container(border=True)
        result_box.write(markup, unsafe_allow_html=True)
48
-
49
  # def question_answering(context, question):
50
  # with st.spinner(text="Loading question model..."):
51
  # question_answerer = question_model()
52
  # with st.spinner(text="Getting answer..."):
53
- # segment_size = 45000
54
- # overlap_size = 50
55
- # text_length = len(context)
56
- # segments = []
57
-
58
- # # Split context into segments
59
- # for i in range(0, text_length, segment_size - overlap_size):
60
- # segment_start = i
61
- # segment_end = i + segment_size
62
- # segment = context[segment_start:segment_end]
63
- # segments.append(segment)
64
-
65
- # answers = {} # Dictionary to store answers for each segment
66
-
67
- # # Get answers for each segment
68
- # for i, segment in enumerate(segments):
69
- # answer = question_answerer(context=segment, question=question)
70
- # answers[i] = answer
71
-
72
- # # Find the answer with the highest score
73
- # highest_score = -1
74
- # highest_answer = None
75
- # for segment_index, answer in answers.items():
76
- # print(answer)
77
- # score = answer["score"]
78
- # if score > highest_score:
79
- # highest_score = score
80
- # highest_answer = answer
81
-
82
- # if highest_answer is not None:
83
- # answer = highest_answer["answer"]
84
- # answer_score = str(highest_answer["score"])
85
-
86
- # # Display the result in container
87
- # container = st.container(border=True)
88
- # container.write("<h5><b>Answer:</b></h5>" + answer + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
89
- # unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  @st.cache_data(show_spinner=False)
92
  def extract_text(file_path):
@@ -106,8 +106,8 @@ def extract_text(file_path):
106
  for i, image in enumerate(images):
107
  image_text += pytesseract.image_to_string(image)
108
 
109
- # text = text + image_text
110
- text = image_text
111
  # replace each single newline with a space; runs of two or more newlines are left unchanged
112
  text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
113
  return text
 
30
  question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, handle_impossible_answer=True)
31
  return question_answerer
32
 
33
+ # # get the answer by passing the context & question to the model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # def question_answering(context, question):
35
  # with st.spinner(text="Loading question model..."):
36
  # question_answerer = question_model()
37
  # with st.spinner(text="Getting answer..."):
38
+ # answer = question_answerer(context=context, question=question)
39
+ # print(answer)
40
+ # answer_score = str(answer["score"])
41
+ # answer = answer["answer"]
42
+ # if (answer==""):
43
+ # answer = "CANNOT ANSWER"
44
+
45
+ # # display the result in container
46
+ # container = st.container(border=True)
47
+ # container.write("<h5><b>Answer:</b></h5>"+answer+"<p><small>(F1 score: "+answer_score+")</small></p><br>", unsafe_allow_html=True)
48
+
49
def question_answering(context, question):
    """Answer ``question`` from ``context`` and render the best answer.

    The context is cut into overlapping character windows, the
    question-answering pipeline returned by ``question_model()`` is run on
    each window, and the result with the highest ``score`` is written into a
    bordered Streamlit container. If the context is empty (no window to query)
    or the selected answer is the empty string, "CANNOT ANSWER" is shown.

    Args:
        context: Text the answer is extracted from.
        question: Question to ask about ``context``.
    """
    import html  # local import: only needed to escape the rendered answer

    segment_size = 45000
    overlap_size = 50
    # Consecutive windows start `step` characters apart, so neighbouring
    # windows share `overlap_size` characters at their boundary.
    step = segment_size - overlap_size

    with st.spinner(text="Loading question model..."):
        question_answerer = question_model()

    with st.spinner(text="Getting answer..."):
        # Split the context into overlapping windows.
        segments = [
            context[start:start + segment_size]
            for start in range(0, len(context), step)
        ]

        # Query every window and keep the highest-scoring result. On ties the
        # earliest window wins because the comparison is strict.
        best_result = None
        for segment in segments:
            result = question_answerer(context=segment, question=question)
            print(result)
            if best_result is None or result["score"] > best_result["score"]:
                best_result = result

        # Defaults cover the case where there was no window to query.
        answer = "CANNOT ANSWER"
        answer_score = "0"
        if best_result is not None:
            answer_score = str(best_result["score"])
            # The pipeline is built with handle_impossible_answer=True, so the
            # best answer may be an empty string; keep the placeholder then.
            if best_result["answer"] != "":
                answer = best_result["answer"]

        # Display the result in a container. The answer is text taken from
        # the supplied document, so escape it before embedding it in markup
        # that is rendered with unsafe_allow_html=True.
        # NOTE(review): the label reads "F1 score" but the value shown is the
        # pipeline's "score" field — confirm the wording.
        container = st.container(border=True)
        container.write(
            "<h5><b>Answer:</b></h5>" + html.escape(answer)
            + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
            unsafe_allow_html=True,
        )
90
 
91
  @st.cache_data(show_spinner=False)
92
  def extract_text(file_path):
 
106
  for i, image in enumerate(images):
107
  image_text += pytesseract.image_to_string(image)
108
 
109
+ text = text + image_text
110
+ # text = image_text
111
  # replace each single newline with a space; runs of two or more newlines are left unchanged
112
  text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
113
  return text