File size: 6,459 Bytes
abe8f23
69c0b63
 
 
 
 
aafae3c
 
69c0b63
abe8f23
69c0b63
aafae3c
 
 
 
c3e4f4a
aafae3c
 
 
 
 
69c0b63
 
 
abe8f23
 
 
69c0b63
 
abe8f23
69c0b63
 
abe8f23
aafae3c
 
 
abe8f23
69c0b63
aafae3c
69c0b63
 
aafae3c
 
 
 
 
 
 
 
 
69c0b63
abe8f23
aafae3c
69c0b63
 
 
 
 
 
 
 
 
 
aafae3c
 
 
 
 
 
69c0b63
aafae3c
69c0b63
aafae3c
69c0b63
 
 
 
 
aafae3c
 
 
 
69c0b63
aafae3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69c0b63
 
 
aafae3c
69c0b63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aafae3c
 
69c0b63
 
 
 
 
aafae3c
 
69c0b63
 
 
 
 
aafae3c
 
 
 
 
 
 
69c0b63
aafae3c
69c0b63
 
aafae3c
 
69c0b63
aafae3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abe8f23
 
69c0b63
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# Standard library
import logging
import os
from pathlib import Path

# Third-party
import google.generativeai as genai
import streamlit as st
from dotenv import load_dotenv
from llama_index.core import (
    PromptTemplate,
    QueryBundle,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    get_response_synthesizer,
    load_index_from_storage,
)
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine, TransformQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever, VectorIndexRetriever
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini

load_dotenv()

# Set logging level
logging.basicConfig(level=logging.INFO)

# Configure Gemini Pro
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

model_gemini_pro_vision = "gemini-pro-vision"
model_gemini_pro = "gemini-pro"


# Configure Gemini models
Settings.llm = Gemini(model=model_gemini_pro, api_key=os.getenv("GOOGLE_API_KEY"))
Settings.embed_model = GeminiEmbedding(
    model_name="models/embedding-001",
    api_key=os.getenv("GOOGLE_API_KEY")
)


# Function to create a Semantic Splitter Node Parser
def create_semantic_splitter_node_parser():
    """Creates a semantic splitter."""
    return SemanticSplitterNodeParser(
        buffer_size=1, breakpoint_percentile_threshold=95, embed_model=Settings.embed_model
    )


def load_and_index_pdf(pdf_path):
    """Loads and index the pdf.
    
    Args : 
    pdf_path (str) : The path to the pdf file
    
    Returns : 
    index (llama_index.core.VectorStoreIndex): The vector index
    """
    try:
         logging.info(f"Loading PDF document from: {pdf_path}")
         documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
         if documents:
            logging.info("Creating semantic splitter")
            node_parser = create_semantic_splitter_node_parser()
            nodes = node_parser.get_nodes_from_documents(documents)
            logging.info("Creating vector store index")
            index = VectorStoreIndex(nodes=nodes)
            return index
         else:
            logging.warning("No documents found in the PDF")
            return None
    except Exception as e:
        logging.error(f"Error loading and indexing PDF: {e}")
        return None


def create_rag_pipeline(index):
    """Create the RAG query engine: retrieval + HyDE transform + rerank + refine.

    Args:
        index (VectorStoreIndex): The vector index built over the grammar PDF.

    Returns:
        TransformQueryEngine: Engine that applies a HyDE query transform before
        retrieval, reranks the retrieved nodes, and synthesizes the answer in
        "refine" mode. Callers only need its `.query(...)` method.
    """

    logging.info("Initializing RAG Pipeline components")

    # Retrieve directly from the existing index. (The previous code built a
    # second VectorStoreIndex from `index.nodes` — an attribute that does not
    # exist on VectorStoreIndex, so the call raised AttributeError.)
    retriever = index.as_retriever(similarity_top_k=5)

    # HyDE: have the LLM draft a hypothetical answer and retrieve against it.
    hyde_query_transform = HyDEQueryTransform(llm=Settings.llm)

    # Cross-encoder reranker keeps the 3 most relevant of the 5 candidates.
    reranker = SentenceTransformerRerank(top_n=3, model="BAAI/bge-reranker-base")

    # "refine" iteratively improves the answer over the retrieved chunks.
    response_synthesizer = get_response_synthesizer(
        response_mode="refine",
    )

    # RetrieverQueryEngine has no `query_transform` parameter (passing one
    # raised TypeError); the HyDE transform is applied by wrapping the base
    # engine in a TransformQueryEngine instead.
    base_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[reranker],
    )
    query_engine = TransformQueryEngine(base_engine, query_transform=hyde_query_transform)

    logging.info("RAG Pipeline is configured.")
    return query_engine

def translate_text(french_text, query_engine):
    """Translate French text to Yipunu through the RAG query engine.

    Args:
        french_text (str): The French text to translate.
        query_engine: Object exposing `.query(str)` that returns an object
            with a `.response` attribute (e.g. a llama_index query engine).

    Returns:
        str: The Yipunu translation, or an error message on failure (this
        function never raises; errors are logged and reported as strings).
    """

    try:
        logging.info(f"Initiating translation of: {french_text}")

        # Build the instruction prompt with the source text embedded. (The
        # previous code passed a PromptTemplate to QueryBundle via a
        # `custom_prompt` keyword that QueryBundle does not accept, so the
        # call raised TypeError and the template was never applied.)
        query = (
            "Tu es un excellent traducteur du français vers le yipunu. Tu traduis le texte sans donner d'explication. "
            f"Texte: {french_text} "
            "Traduction:"
        )

        response = query_engine.query(query)
        logging.info(f"Translation Result: {response.response}")
        return response.response
    except Exception as e:
        logging.error(f"Error during translation: {e}")
        return f"Error during translation: {str(e)}"
    


def _render_translation_ui(index):
    """Render the text area + translate button for a ready-to-query index."""
    query_engine = create_rag_pipeline(index)
    french_text = st.text_area("Enter French Text:", "Ni vosi yipunu")
    if st.button("Translate"):
        translation = translate_text(french_text, query_engine)
        st.success(f"Yipunu Translation: {translation}")


def main():
    """Streamlit entry point: load (or accept an upload of) the grammar PDF,
    build the RAG pipeline, and expose the translation UI."""

    st.title("French to Yipunu Translation App")

    # Prefer the bundled grammar PDF; fall back to a user upload.
    default_pdf_path = Path("data/parlons_yipunu.pdf")

    if default_pdf_path.exists():
        index = load_and_index_pdf(str(default_pdf_path))
        if index:
            _render_translation_ui(index)
        return

    uploaded_file = st.file_uploader("Upload a PDF file containing the Punu grammar:", type="pdf")
    if uploaded_file is None:
        st.info("Please upload a pdf containing the punu grammar.")
        return

    # Persist the upload so SimpleDirectoryReader can read it from disk.
    temp_file_path = Path("temp_file.pdf")
    temp_file_path.write_bytes(uploaded_file.read())
    try:
        index = load_and_index_pdf(str(temp_file_path))
        if index:
            _render_translation_ui(index)
    finally:
        # Remove the temp file even if pipeline construction raises
        # (the original `os.remove` was skipped on any exception above it).
        temp_file_path.unlink()


if __name__ == "__main__":
    main()