"""Gradio front end for semantic search over the Megillah sugyot dataset.

At import time this script:

1. configures the global LlamaIndex embedding model,
2. loads the "train" split of ``davidr70/megilla_sugyot_merged`` with
   ``datasets.load_dataset``,
3. wraps every row in a ``Document`` and builds an in-memory
   ``VectorStoreIndex`` from them,
4. builds a Gradio ``Blocks`` UI whose search box calls :func:`ask`, and
5. launches the UI with ``share=True``.
"""

import gradio as gr
from datasets import load_dataset
from llama_index.core import Document, Settings, VectorStoreIndex

# HyDEQueryTransform / TransformQueryEngine are not used by the active code
# path; they are kept for the HyDE experiment noted further down.
from llama_index.core.indices.query.query_transform.base import (
    HyDEQueryTransform,
)
from llama_index.core.query_engine import TransformQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding

EMBEDDING_MODEL = "text-embedding-3-small"
# Alternative dataset tried previously: "davidr70/megillah_english_sugyot".
DATASET_NAME = "davidr70/megilla_sugyot_merged"
SIMILARITY_TOP_K = 7  # Number of hits to return

Settings.embed_model = OpenAIEmbedding(model_name=EMBEDDING_MODEL)

dataset = load_dataset(DATASET_NAME, split="train")
# Each row is read through its 'content' and 'metadata' fields.
documents = [
    Document(text=item["content"], metadata=item["metadata"])
    for item in dataset
]

# NOTE: experiments that are currently disabled:
#   hyde = HyDEQueryTransform(include_original=True)
#   documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)
retriever = index.as_retriever(
    similarity_top_k=SIMILARITY_TOP_K,
    vector_store_query_mode="default",  # Basic semantic search
)


def ask(question: str) -> str:
    """Return the retrieved hits for *question* as one printable string.

    Args:
        question: Free-text query typed into the search box.

    Returns:
        One ``score`` / ``metadata`` / ``text`` section per retrieved node,
        in the order the retriever returned them, each followed by three
        newlines. An empty string is returned when the question is empty or
        whitespace-only, or when nothing is retrieved.
    """
    # Guard clause: skip the retriever (and its embedding request) when
    # there is nothing to search for.
    if not question or not question.strip():
        return ""

    nodes = retriever.retrieve(question)
    # Build the output in one pass instead of repeated string concatenation.
    return "".join(
        f"score: {node.score!s}\n"
        f"metadata: {node.metadata!s}\n"
        f"text: {node.text}\n\n\n"
        for node in nodes
    )


with gr.Blocks(title="Megillah Search") as demo:
    gr.Markdown("# Megillah Search")
    gr.Markdown("Search through the Megillah dataset")

    question = gr.Textbox(
        label="Question",
        placeholder="Ask a question about Megillah...",
    )
    submit_btn = gr.Button("Search")
    answer = gr.Textbox(label="Sources", lines=20)

    # Clicking the button and submitting the textbox both run the same search.
    submit_btn.click(fn=ask, inputs=question, outputs=answer)
    question.submit(fn=ask, inputs=question, outputs=answer)

demo.launch(share=True)