"""Gradio demo exposing BM25 document search over the contents of ``df.parquet``.

At import time the script:

1. loads ``df.parquet`` (must contain a ``fileclean`` column) and
   ``candidats.parquet`` (must contain a ``text`` column);
2. de-duplicates ``df`` on ``fileclean``, renames that column to ``content``
   and writes the rows into an in-memory Haystack document store with BM25
   enabled;
3. builds a ``DocumentSearchPipeline`` around a ``BM25Retriever``;
4. wires a Gradio interface whose single input is a dropdown of the sorted
   ``candidats.text`` values and whose output is a dataframe of hits.
"""

import gradio as gr
import pandas as pd
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever
from haystack.pipelines import DocumentSearchPipeline

# Number of documents requested from the retriever for each query.
_TOP_K = 10

# Keys removed from every serialized document before it is displayed.
_DROPPED_FIELDS = ("content_type", "embedding", "id")

df = pd.read_parquet('df.parquet')
candidats = pd.read_parquet('candidats.parquet')

document_store = InMemoryDocumentStore(use_bm25=True)

# One document per distinct 'fileclean' value; that column becomes the
# document 'content', the remaining columns travel along in each record.
docs = (
    df.drop_duplicates(subset=['fileclean'])
    .rename(columns={'fileclean': 'content'})
    .to_dict(orient='records')
)
document_store.write_documents(docs)

retriever = BM25Retriever(document_store=document_store)
pipeline = DocumentSearchPipeline(retriever=retriever)


def semanticsearch(query):
    """Return the top matching documents for *query* as a DataFrame.

    Despite its name, retrieval is performed by the BM25 retriever
    configured at module level, not by an embedding model.

    Args:
        query: Search text (the value selected in the Gradio dropdown).

    Returns:
        A ``pandas.DataFrame`` with one row per retrieved document, built
        from ``Document.to_dict()`` with the ``content_type``, ``embedding``
        and ``id`` keys removed. An empty DataFrame is returned when *query*
        is empty or ``None`` (nothing selected) or when nothing matches.
    """
    if not query:
        # Nothing selected in the UI: show an empty table instead of
        # handing an empty query to the retriever.
        return pd.DataFrame()

    output = pipeline.run(
        query=query,
        params={"Retriever": {"top_k": _TOP_K}},
        debug=False,
    )

    rows = []
    for document in output['documents']:
        row = document.to_dict()
        for field in _DROPPED_FIELDS:
            # Tolerate serialized documents that lack one of these keys.
            row.pop(field, None)
        rows.append(row)

    return pd.DataFrame(rows)


demo = gr.Interface(
    semanticsearch,
    [
        gr.Dropdown(candidats.sort_values(by='text').text.tolist()),
    ],
    [gr.Dataframe()],
)

if __name__ == "__main__":
    demo.launch()