import streamlit as st
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
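
# Pipeline: embed the uploaded documents with a sentence-transformer model, index the vectors
# in FAISS, retrieve the nearest documents for a user question, and feed them to flan-t5 to
# generate an answer.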

# Load the Sentence Transformer and T5 model once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    qa_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
    return embedding_model, qa_model, tokenizer

embedding_model, qa_model, tokenizer = load_models()
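# The first run downloads the model weights from the Hugging Face Hub; later runs reuse the cached models.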

# Upload and load the CSV file
st.title("Economics & Population Advisor")
uploaded_file = st.file_uploader("Upload your CSV file with economic documents", type=["csv"])

if uploaded_file is not None:
    # Load CSV with error handling
    df = pd.read_csv(uploaded_file, on_bad_lines='skip', engine='python')
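    # on_bad_lines='skip' silently drops malformed rows; the python engine is more tolerant of irregular quoting.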
    st.write("Dataset Preview:", df.head())

    # Allow user to specify the column containing the text (economic documents)
    text_column = st.text_input("Specify the column containing the document text:", value="Country Name")

    if text_column not in df.columns:
        st.error(f"The column '{text_column}' was not found in the dataset.")
    else:
        # Extract documents from the specified column (drop missing values and coerce to strings)
        documents = df[text_column].dropna().astype(str).tolist()

        # Create embeddings for FAISS indexing
        st.write("Indexing documents...")
        embeddings = embedding_model.encode(documents, convert_to_numpy=True)
        dimension = embeddings.shape[1]
        
        # Create a FAISS index and add embeddings
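        # IndexFlatL2 performs exact (brute-force) L2 search over the 384-dimensional MiniLM vectors;
        # for much larger corpora an approximate index such as faiss.IndexIVFFlat would scale better.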
        index = faiss.IndexFlatL2(dimension)
        index.add(np.array(embeddings, dtype=np.float32))
        st.write("Indexing complete.")

        # Function to generate an answer with the T5 model, conditioned on the question and the retrieved context
        def generate_answer(question, context):
            prompt = f"Answer the question using the context.\n\nQuestion: {question}\n\nContext: {context}"
            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
            outputs = qa_model.generate(inputs["input_ids"], max_length=150, min_length=50, length_penalty=2.0)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
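        # Note: max_length / min_length above count generated tokens, and length_penalty only
        # takes effect when beam search (num_beams > 1) is enabled.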

        # RAG functionality: Ask a question, retrieve documents, and generate an answer
        st.subheader("Ask a Question about Economic Data")
        question = st.text_input("Enter your question:")

        if st.button("Get Answer") and question:
            # Embed the question
            question_embedding = embedding_model.encode([question], convert_to_numpy=True)
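            # The question is embedded with the same model as the documents, so L2 distances in the index are comparable.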

            # Search the index for the most relevant documents (never request more neighbours than exist)
            k = min(3, len(documents))
            D, I = index.search(np.array(question_embedding, dtype=np.float32), k=k)
            retrieved_docs = [documents[i] for i in I[0]]

            # Combine the retrieved documents into a single context string
            context = " ".join(retrieved_docs)
            if len(context) > 1000:  # Truncate overly long context before tokenization
                context = context[:1000]

            # Generate the answer from the question and the retrieved context
            answer = generate_answer(question, context)
            st.write("Answer:", answer)