import streamlit as st
import networkx as nx
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import torch
import nltk
from nltk.tokenize import sent_tokenize

# Sentence tokenizer data; recent NLTK releases look for 'punkt_tab' as well
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

@st.cache_resource
def load_models():
    # Load SentenceTransformer model for sentence embeddings
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    return sentence_model
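
# Note: @st.cache_resource keeps the SentenceTransformer in memory across
# Streamlit reruns, so the model is downloaded and loaded only once per process.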

def text_to_graph(text, sentence_model):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)

    # Create graph with one node per sentence
    G = nx.Graph()

    # Embed all sentences in a single batch call (faster than one at a time)
    embeddings = sentence_model.encode(sentences)

    for i, sentence in enumerate(sentences):
        G.add_node(i, text=sentence, embedding=embeddings[i])

    # Connect sentence pairs whose embeddings exceed a cosine-similarity threshold
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity = torch.cosine_similarity(
                torch.tensor(G.nodes[i]['embedding']),
                torch.tensor(G.nodes[j]['embedding']),
                dim=0
            )
            if similarity > 0.5:  # adjust this threshold as needed
                G.add_edge(i, j, weight=similarity.item())

    return G, sentences
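
# A minimal sketch of what text_to_graph produces, using illustrative sentences
# (the exact edges depend on the model and the 0.5 threshold):
#
#   G, sents = text_to_graph(
#       "Cats are mammals. Dogs are mammals. Stocks fell sharply today.",
#       sentence_model,
#   )
#   # The two 'mammals' sentences are typically similar enough to share an
#   # edge, while the finance sentence usually remains isolated.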

def analyze_text(text, sentence_model):
    G, sentences = text_to_graph(text, sentence_model)

    # Basic graph statistics (guard against empty input)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    avg_degree = sum(dict(G.degree()).values()) / num_nodes if num_nodes else 0.0

    # Identify the most central sentences using PageRank
    pagerank = nx.pagerank(G)
    important_sentences = sorted(pagerank, key=pagerank.get, reverse=True)[:3]

    return G, sentences, num_nodes, num_edges, avg_degree, important_sentences
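
# PageRank here serves as a centrality score: a sentence similar to many other
# well-connected sentences ranks highest. This is the same idea TextRank uses
# for extractive summarization, applied to an embedding-similarity graph.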

st.title("GraphRAG-based Text Analysis")

sentence_model = load_models()

text_input = st.text_area("Enter text for analysis:", height=200)

if st.button("Analyze Text"):
    if text_input:
        G, sentences, num_nodes, num_edges, avg_degree, important_sentences = analyze_text(text_input, sentence_model)
        
        st.write(f"Number of sentences: {num_nodes}")
        st.write(f"Number of connections: {num_edges}")
        st.write(f"Average connections per sentence: {avg_degree:.2f}")
        
        st.subheader("Most important sentences:")
        for i in important_sentences:
            st.write(f"- {sentences[i]}")
        
        # Visualize the sentence graph; pass an explicit figure to st.pyplot,
        # since passing the pyplot module is deprecated in newer Streamlit
        fig, ax = plt.subplots(figsize=(10, 6))
        pos = nx.spring_layout(G)
        nx.draw(G, pos, ax=ax, with_labels=False, node_size=30,
                node_color='lightblue', edge_color='gray')
        ax.set_title("Text as Graph")
        st.pyplot(fig)
        
    else:
        st.write("Please enter some text to analyze.")
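
# To try the app locally (assuming this file is saved as app.py):
#   pip install streamlit nltk networkx matplotlib torch sentence-transformers
#   streamlit run app.py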