embedding paragraphs
- app.py +47 -5
- requirements.txt +4 -0
app.py
CHANGED
@@ -1,6 +1,16 @@
 import streamlit as st
 import os
 import json
+
+from transformers import GPT2Tokenizer, GPT2LMHeadModel, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
+
+import torch
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import nltk
+from nltk.tokenize import sent_tokenize
+
+
 def is_new_file_upload(uploaded_file):
     if 'last_uploaded_file' in st.session_state:
         # Check if the newly uploaded file is different from the last one
@@ -44,18 +54,50 @@ if uploaded_json_file is not None:
     # print("page_count=",st.session_state.page_count)
     content = uploaded_json_file.read()
     try:
-
+        st.session_state.restored_paragraphs = json.loads(content)
         #print(data)
         # Check if the parsed data is a dictionary
-        if isinstance(
-            # Count the
-            st.session_state.list_count = len(
+        if isinstance(st.session_state.restored_paragraphs, list):
+            # Count the restored_paragraphs of top-level elements
+            st.session_state.list_count = len(st.session_state.restored_paragraphs)
             st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
         else:
             st.write('The JSON content is not a dictionary.')
     except json.JSONDecodeError:
         st.write('Invalid JSON file.')
     st.rerun()
+if 'is_initialized' not in st.session_state:
+    st.session_state['is_initialized'] = True
+
+    nltk.download('punkt')
+    st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
+    st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
 
 if 'list_count' in st.session_state:
-    st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
+    st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
+    if 'paragraph_sentence_encodings' not in st.session_state:
+        print("start embedding paragarphs")
+        read_progress_bar = st.progress(0)
+        st.session_state.paragraph_sentence_encodings = []
+        for index,paragraph in enumerate(st.session_state.restored_paragraphs):
+            #print(paragraph)
+
+            progress_percentage = (index) / (st.session_state.list_count - 1)
+            print(progress_percentage)
+            read_progress_bar.progress(progress_percentage)
+
+            sentence_encodings = []
+            sentences = sent_tokenize(paragraph['text'])
+            for sentence in sentences:
+                if sentence.strip().endswith('?'):
+                    sentence_encodings.append(None)
+                    continue
+                if len(sentence.strip()) < 4:
+                    sentence_encodings.append(None)
+                    continue
+                sentence_tokens = st.session_state.bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to('cuda')
+                with torch.no_grad():
+                    sentence_encoding = st.session_state.bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
+                sentence_encodings.append([sentence, sentence_encoding])
+                # sentence_encodings.append([sentence,bert_model(**sentence_tokens).last_hidden_state[:, 0, :].detach().numpy()])
+            st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
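Taken together, the app.py changes parse the uploaded JSON into st.session_state.restored_paragraphs, run a one-time initialization block that downloads the NLTK punkt tokenizer and loads bert-base-uncased onto the GPU, and then encode each paragraph sentence by sentence, skipping questions and fragments shorter than four characters and keeping the [CLS] vector of each remaining sentence. Below is a minimal standalone sketch of that embedding step outside Streamlit; the CPU fallback and the paragraphs.json input path are assumptions added here, since the committed code hard-codes 'cuda' and reads from the uploaded file.

# Standalone sketch of the embedding step introduced in this commit (not the
# committed code itself). Input is assumed to be a top-level JSON list of
# objects, each with a 'text' field, as the app expects.
import json
import nltk
import torch
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertModel

nltk.download('punkt')
device = 'cuda' if torch.cuda.is_available() else 'cpu'   # the commit uses 'cuda' only
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device).eval()

with open("paragraphs.json") as f:   # hypothetical input file
    restored_paragraphs = json.load(f)

paragraph_sentence_encodings = []
for paragraph in restored_paragraphs:
    sentence_encodings = []
    for sentence in sent_tokenize(paragraph['text']):
        # Skip questions and very short fragments, as the commit does.
        if sentence.strip().endswith('?') or len(sentence.strip()) < 4:
            sentence_encodings.append(None)
            continue
        tokens = tokenizer(sentence, return_tensors="pt",
                           padding=True, truncation=True).to(device)
        with torch.no_grad():
            # [CLS] token embedding as the sentence vector, shape (1, 768).
            vector = model(**tokens).last_hidden_state[:, 0, :].cpu().numpy()
        sentence_encodings.append([sentence, vector])
    paragraph_sentence_encodings.append([paragraph, sentence_encodings])

Two details in the committed version are worth flagging: progress_percentage = index / (st.session_state.list_count - 1) divides by zero when the uploaded list has exactly one element, and the else branch still reports 'The JSON content is not a dictionary.' even though the check is now isinstance(..., list).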
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+transformers
+torch
+scikit-learn
+nltk
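The new requirements.txt covers the imports added to app.py; scikit-learn provides cosine_similarity, which this commit imports but does not use yet, and numpy is imported directly in app.py but only arrives transitively via torch and scikit-learn. A later change will presumably compare a query encoding against the stored sentence vectors; the following is a hypothetical sketch of that step, assuming the [paragraph, [[sentence, vector], ...]] structure built above and a query_vector produced by the same BERT encoder.

# Hypothetical retrieval step using the cosine_similarity import added in this
# commit; it is not part of the committed code.
from sklearn.metrics.pairwise import cosine_similarity

def best_paragraph(query_vector, paragraph_sentence_encodings):
    # query_vector: shape (1, 768), produced the same way as the sentence vectors above.
    best_match, best_score = None, -1.0
    for paragraph, sentence_encodings in paragraph_sentence_encodings:
        for item in sentence_encodings:
            if item is None:             # skipped sentences (questions, short fragments)
                continue
            sentence, vector = item
            score = float(cosine_similarity(query_vector, vector)[0][0])
            if score > best_score:
                best_match, best_score = paragraph, score
    return best_match, best_score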