Spaces:

zmbfeng
/

knowledge_extraction_a

Sleeping

App Files Files Community

zmbfeng committed on Aug 9, 2024

Commit

43d2a91

1 Parent(s): 2f33806

show GUI after init

Browse files

Files changed (1) hide show

app.py +40 -39

app.py CHANGED Viewed

@@ -69,6 +69,46 @@ def paraphrase(sentence):
     #results.append(line)
   return line
 big_text = """
     <div style='text-align: center;'>
         <h1 style='font-size: 30x;'>Knowledge Extraction A</h1>
@@ -113,46 +153,7 @@ if uploaded_json_file is not None:
         except json.JSONDecodeError:
             st.write('Invalid JSON file.')
         st.rerun()
-if 'is_initialized' not in st.session_state:
-    st.session_state['is_initialized'] = True
-    nltk.download('punkt')
-    nltk.download('stopwords')
-    st.session_state.stop_words = set(stopwords.words('english'))
-    st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
-    st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
-    st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
-    st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')
-    print(str(st.session_state.paraphrase_model ))
-if 'list_count' in st.session_state:
-    st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
-    if 'paragraph_sentence_encodings' not in st.session_state:
-        print("start embedding paragarphs")
-        read_progress_bar = st.progress(0)
-        st.session_state.paragraph_sentence_encodings = []
-        for index,paragraph in enumerate(st.session_state.restored_paragraphs):
-            #print(paragraph)
-            progress_percentage = (index) / (st.session_state.list_count - 1)
-            # print(progress_percentage)
-            read_progress_bar.progress(progress_percentage)
-            sentence_encodings = []
-            sentences = sent_tokenize(paragraph['text'])
-            for sentence in sentences:
-                if sentence.strip().endswith('?'):
-                    sentence_encodings.append(None)
-                    continue
-                if len(sentence.strip()) < 4:
-                    sentence_encodings.append(None)
-                    continue
-                sentence_tokens = st.session_state.bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to('cuda')
-                with torch.no_grad():
-                    sentence_encoding = st.session_state.bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
-                sentence_encodings.append([sentence, sentence_encoding])
-                # sentence_encodings.append([sentence,bert_model(**sentence_tokens).last_hidden_state[:, 0, :].detach().numpy()])
-            st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
-        st.rerun()
 if 'paragraph_sentence_encodings' in st.session_state:
     query = st.text_input("Enter your query")

     #results.append(line)
   return line
+# One-time setup per session: download the NLTK data, load the BERT encoder and
+# the T5 paraphrase model onto the GPU, and cache everything in session state.
+if 'is_initialized' not in st.session_state:
+    nltk.download('punkt')
+    nltk.download('stopwords')
+    st.session_state.stop_words = set(stopwords.words('english'))
+    st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
+    st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
+    st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
+    st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')
+    print(str(st.session_state.paraphrase_model ))
+    # Mark the session initialized only after every resource has loaded, so a
+    # failed download or model load is retried on the next rerun instead of
+    # leaving the session flagged as ready with attributes missing.
+    st.session_state['is_initialized'] = True
+# Once list_count is available, encode every sentence of every restored
+# paragraph exactly once and keep the results in session state.
+if 'list_count' in st.session_state:
+    st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
+    if 'paragraph_sentence_encodings' not in st.session_state:
+        print("start embedding paragarphs")
+        read_progress_bar = st.progress(0)
+        st.session_state.paragraph_sentence_encodings = []
+        # Clamp the denominator to 1: with a single paragraph, list_count - 1
+        # is 0 and the original division raised ZeroDivisionError.
+        progress_denominator = max(st.session_state.list_count - 1, 1)
+        for index,paragraph in enumerate(st.session_state.restored_paragraphs):
+            # Cap at 1.0 because st.progress rejects floats outside [0.0, 1.0],
+            # which could happen if there are more paragraphs than list_count.
+            progress_percentage = min(index / progress_denominator, 1.0)
+            read_progress_bar.progress(progress_percentage)
+            sentence_encodings = []
+            sentences = sent_tokenize(paragraph['text'])
+            for sentence in sentences:
+                stripped_sentence = sentence.strip()
+                # Questions and very short fragments are not encoded; a None
+                # placeholder keeps positions aligned with the sentence list.
+                if stripped_sentence.endswith('?') or len(stripped_sentence) < 4:
+                    sentence_encodings.append(None)
+                    continue
+                sentence_tokens = st.session_state.bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to('cuda')
+                with torch.no_grad():
+                    # Use the hidden state at the first token position as the
+                    # sentence vector, moved back to the CPU as a NumPy array.
+                    sentence_encoding = st.session_state.bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
+                sentence_encodings.append([sentence, sentence_encoding])
+            st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
+        # Rerun so the rest of the script sees the finished encodings.
+        st.rerun()
 big_text = """
     <div style='text-align: center;'>
         <h1 style='font-size: 30x;'>Knowledge Extraction A</h1>
         except json.JSONDecodeError:
             st.write('Invalid JSON file.')
         st.rerun()
 if 'paragraph_sentence_encodings' in st.session_state:
     query = st.text_input("Enter your query")