File size: 6,460 Bytes
abe8f23 69c0b63 aafae3c 69c0b63 abe8f23 69c0b63 aafae3c c3e4f4a aafae3c 69c0b63 abe8f23 69c0b63 abe8f23 69c0b63 abe8f23 aafae3c abe8f23 69c0b63 9a09dcf 69c0b63 aafae3c 69c0b63 abe8f23 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c 69c0b63 aafae3c abe8f23 69c0b63 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import logging
import os
from pathlib import Path

import google.generativeai as genai
import streamlit as st
from dotenv import load_dotenv
from llama_index.core import (
    PromptTemplate,
    QueryBundle,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    get_response_synthesizer,
    load_index_from_storage,
)
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine, TransformQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever, VectorIndexRetriever
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
# Load environment variables (expects GOOGLE_API_KEY in .env or the environment).
load_dotenv()

# Set logging level
logging.basicConfig(level=logging.INFO)

# Configure the google-generativeai SDK with the API key.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Model identifiers. NOTE(review): newer llama-index Gemini versions may expect
# the fully-qualified form "models/gemini-pro" — confirm against the installed version.
model_gemini_pro_vision = "gemini-pro-vision"
model_gemini_pro = "gemini-pro"

# Register the default LLM and embedding model with llama-index.
# BUG FIX: the Gemini constructor's keyword is `model`, not `models`; the
# original `models=` never reached the constructor's model parameter, so the
# class default was used regardless of `model_gemini_pro`.
Settings.llm = Gemini(model=model_gemini_pro, api_key=os.getenv("GOOGLE_API_KEY"))
Settings.embed_model = GeminiEmbedding(
    model_name="models/embedding-001",
    api_key=os.getenv("GOOGLE_API_KEY")
)
# Function to create a Semantic Splitter Node Parser
def create_semantic_splitter_node_parser():
    """Build the node parser that splits documents on semantic boundaries.

    Returns:
        SemanticSplitterNodeParser: parser configured with a 1-sentence buffer,
        a 95th-percentile breakpoint threshold, and the globally configured
        embedding model.
    """
    splitter = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=95,
        embed_model=Settings.embed_model,
    )
    return splitter
def load_and_index_pdf(pdf_path):
    """Load a PDF and build a vector index over its semantic chunks.

    Args:
        pdf_path (str): The path to the pdf file.

    Returns:
        llama_index.core.VectorStoreIndex | None: the vector index, or ``None``
        when the PDF yields no documents or any step fails (errors are logged).
    """
    try:
        logging.info(f"Loading PDF document from: {pdf_path}")
        documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

        # Guard clause: nothing to index.
        if not documents:
            logging.warning("No documents found in the PDF")
            return None

        logging.info("Creating semantic splitter")
        parser = create_semantic_splitter_node_parser()
        chunks = parser.get_nodes_from_documents(documents)

        logging.info("Creating vector store index")
        return VectorStoreIndex(nodes=chunks)
    except Exception as e:
        logging.error(f"Error loading and indexing PDF: {e}")
        return None
def create_rag_pipeline(index):
    """Creates a RAG pipeline for translation.

    Args:
        index (llama_index.core.VectorStoreIndex): The vector index.

    Returns:
        llama_index.core.query_engine.TransformQueryEngine: a HyDE-wrapped
        query engine (retrieve top-5, rerank to top-3, refine-mode synthesis).
    """
    logging.info("Initializing RAG Pipeline components")

    # BUG FIX: retrieve directly from the existing index. The original built
    # `VectorStoreIndex(index.nodes)` — VectorStoreIndex exposes no public
    # `.nodes` attribute, and rebuilding would re-embed every node anyway.
    retriever = index.as_retriever(similarity_top_k=5)

    # HyDE: embed a hypothetical answer to the query to improve retrieval.
    hyde_query_transform = HyDEQueryTransform(llm=Settings.llm)

    # Cross-encoder reranker keeps the 3 most relevant of the retrieved nodes.
    reranker = SentenceTransformerRerank(top_n=3, model="BAAI/bge-reranker-base")

    response_synthesizer = get_response_synthesizer(
        response_mode="refine",
    )

    # BUG FIX: RetrieverQueryEngine does not accept a `query_transform` kwarg;
    # query transforms are applied by wrapping the engine in TransformQueryEngine.
    base_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[reranker],
    )
    query_engine = TransformQueryEngine(base_engine, hyde_query_transform)

    logging.info("RAG Pipeline is configured.")
    return query_engine
def translate_text(french_text, query_engine):
    """Translates french text to Yipunu using a highly optimized RAG.

    Args:
        french_text (str): The french text to translate.
        query_engine (llama_index.core.query_engine.BaseQueryEngine): The query engine.

    Returns:
        str: The yipunu translation or an error message.
    """
    try:
        logging.info(f"Initiating translation of: {french_text}")
        template = (
            "Tu es un excellent traducteur du français vers le yipunu. Tu traduis le texte sans donner d'explication. "
            "Texte: {french_text} "
            "Traduction:"
        )
        prompt_template = PromptTemplate(template)
        # BUG FIX: QueryBundle has no `custom_prompt` parameter, and the
        # original never filled the {french_text} placeholder, so the
        # instruction prompt never reached the LLM. Render the prompt into
        # the query string instead.
        query = prompt_template.format(french_text=french_text)
        response = query_engine.query(query)
        logging.info(f"Translation Result: {response.response}")
        return response.response
    except Exception as e:
        logging.error(f"Error during translation: {e}")
        return f"Error during translation: {str(e)}"
def main():
    """Main function for streamlit app.

    Uses the bundled grammar PDF when present; otherwise asks the user to
    upload one, indexing a temporary copy that is always cleaned up.
    """
    st.title("French to Yipunu Translation App")

    # Construct the path to the PDF in the data folder
    default_pdf_path = Path("data/parlons_yipunu.pdf")

    if default_pdf_path.exists():
        index = load_and_index_pdf(str(default_pdf_path))
        if index:
            query_engine = create_rag_pipeline(index)
            french_text = st.text_area("Enter French Text:", "Ni vosi yipunu")
            if st.button("Translate"):
                translation = translate_text(french_text, query_engine)
                st.success(f"Yipunu Translation: {translation}")
    else:
        # PDF File Upload
        uploaded_file = st.file_uploader("Upload a PDF file containing the Punu grammar:", type="pdf")
        if uploaded_file is not None:
            # Persist the upload so SimpleDirectoryReader can read it from disk.
            temp_file_path = Path("temp_file.pdf")
            try:
                temp_file_path.write_bytes(uploaded_file.read())
                index = load_and_index_pdf(str(temp_file_path))
                if index:
                    query_engine = create_rag_pipeline(index)
                    french_text = st.text_area("Enter French Text:", "Ni vosi yipunu")
                    if st.button("Translate"):
                        translation = translate_text(french_text, query_engine)
                        st.success(f"Yipunu Translation: {translation}")
            finally:
                # BUG FIX: the original only removed the temp file on the
                # success path, leaking it whenever indexing failed.
                temp_file_path.unlink(missing_ok=True)
        else:
            st.info("Please upload a pdf containing the punu grammar.")


if __name__ == "__main__":
    main()