import gradio as gr
import paperqa
import pandas as pd
import requests
import zipfile
import io
import tempfile
import os

css_style = """
.gradio-container {
    font-family: "IBM Plex Mono";
}
"""


def request_pathname(files, data, openai_api_key):
    if files is None:
        # nothing uploaded yet -- report empty stats and leave the dataset unchanged
        return [[0, 0]], data, data, validate_dataset(pd.DataFrame(data), openai_api_key)
    for file in files:
        # make sure we're not duplicating things in the dataset
        if file.name in [x[0] for x in data]:
            continue
        data.append([file.name, None, None])
    return [[len(data), 0]], data, data, validate_dataset(pd.DataFrame(data), openai_api_key)


def validate_dataset(dataset, openai_api_key):
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""
    key_ready = isinstance(openai_api_key, str) and len(openai_api_key) > 0
    if docs_ready and key_ready:
        return "✨Ready✨"
    elif docs_ready:
        return "⚠️Waiting for key⚠️"
    elif key_ready:
        return "⚠️Waiting for documents⚠️"
    else:
        return "⚠️Waiting for documents and key⚠️"


def make_stats(docs):
    # one row: number of docs and total number of chunks across them
    return [[len(docs.doc_previews), sum([x[0] for x in docs.doc_previews])]]


def do_ask(question, button, openai_api_key, dataset, length,
           do_marg, k, max_sources, docs):
    passages = ""
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""
    if button == "✨Ready✨" and isinstance(openai_api_key, str) \
            and len(openai_api_key) > 0 and docs_ready:
        os.environ['OPENAI_API_KEY'] = openai_api_key.strip()
        if docs is None:
            docs = paperqa.Docs()
        # dataset is a pandas dataframe
        for _, row in dataset.iterrows():
            try:
                docs.add(row['filepath'],
                         row['citation string'],
                         key=row['key'],
                         disable_check=True)
                yield "", "", "", docs, make_stats(docs)
            except Exception:
                # skip documents that fail to parse
                pass
    else:
        # not ready -- report empty stats and stop before querying
        yield "", "", "", docs, [[0, 0]]
        return
    docs._build_faiss_index()
    for result in docs.query_gen(question,
                                 length_prompt=f'use {length:d} words',
                                 marginal_relevance=do_marg,
                                 k=k,
                                 max_sources=max_sources):
        yield result.formatted_answer, result.context, passages, docs, make_stats(docs)
    # raw passage display is disabled for now
    passages = 'Disabled for now'
    yield result.formatted_answer, result.context, passages, docs, make_stats(docs)


def download_repo(gh_repo, data, openai_api_key, pbar=gr.Progress()):
    # download the zipped version of the repo
    r = requests.get(f'https://api.github.com/repos/{gh_repo}/zipball')
    if r.status_code != 200:
        raise ValueError('Unknown GitHub repo')
    pbar(0, 'Downloaded')
    # iterate through the files in the zip
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        for i, f in enumerate(z.namelist()):
            # skip directories
            if f.endswith('/'):
                continue
            # try to read as plaintext (skip binary files)
            try:
                text = z.read(f).decode('utf-8')
            except UnicodeDecodeError:
                continue
            # skip files bigger than 100 kB or smaller than 10 bytes
            if len(text) > 1e5 or len(text) < 10:
                continue
            # strip off the zipball's leading directory
            rel_path = '/'.join(f.split('/')[1:])
            key = os.path.basename(f)
            citation = f'[{rel_path}](https://github.com/{gh_repo}/tree/main/{rel_path})'
            # dedupe on the citation -- temp file paths are always unique
            if citation in [x[1] for x in data]:
                continue
            # save to a temporary file so we have a path to hand to paperqa
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp.write(text.encode('utf-8'))
                path = tmp.name
            data.append([path, citation, key])
            yield [[len(data), 0]], data, data, validate_dataset(pd.DataFrame(data), openai_api_key)
            # gr.Progress expects a fraction in [0, 1]
            pbar((i + 1) / len(z.namelist()), f'Added {f}')
    pbar(1.0, 'Done')


with gr.Blocks(css=css_style) as demo:
    docs = gr.State(None)
    data = gr.State([])

    openai_api_key = gr.Textbox(
        label="OpenAI API Key", placeholder="sk-...", type="password")
    with gr.Tab('File Upload'):
        uploaded_files = gr.File(
            label="Your Documents Upload (PDF or txt)",
            file_count="multiple",
        )
    with gr.Accordion("See Docs:", open=False):
        dataset = gr.Dataframe(
            headers=["filepath", "citation string", "key"],
            datatype=["str", "str", "str"],
            col_count=(3, "fixed"),
            interactive=False,
            label="Documents and Citations",
            overflow_row_behaviour='paginate',
            max_rows=5
        )
    buildb = gr.Textbox("⚠️Waiting for documents and key...",
                        label="Status",
                        interactive=False,
                        show_label=True,
                        max_lines=1)
    stats = gr.Dataframe(headers=['Docs', 'Chunks'],
                         datatype=['number', 'number'],
                         col_count=(2, "fixed"),
                         interactive=False,
                         label="Doc Stats")
    openai_api_key.change(validate_dataset, inputs=[
        dataset, openai_api_key], outputs=[buildb])
    dataset.change(validate_dataset, inputs=[
        dataset, openai_api_key], outputs=[buildb])
    uploaded_files.change(request_pathname, inputs=[
        uploaded_files, data, openai_api_key],
        outputs=[stats, data, dataset, buildb])
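    # NOTE: download_repo is defined above but never wired into the interface
    # in this snippet. Below is a minimal sketch of a hookup; the component
    # names, labels, and placeholder are assumptions, not from the original.
    gh_repo = gr.Textbox(
        label="GitHub Repo (owner/repo)",  # assumed label
        placeholder="whitead/paper-qa")
    download = gr.Button("Download Repo")
    download.click(fn=download_repo, inputs=[
        gh_repo, data, openai_api_key],
        outputs=[stats, data, dataset, buildb])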
    query = gr.Textbox(
        placeholder="Enter your question here...", label="Question")
    with gr.Row():
        length = gr.Slider(25, 200, value=100, step=5,
                           label='Words in answer')
        marg = gr.Checkbox(True, label='Max marginal relevance')
        k = gr.Slider(1, 20, value=10, step=1,
                      label='Chunks to examine')
        sources = gr.Slider(1, 10, value=5, step=1,
                            label='Contexts to include')

    ask = gr.Button("Ask Question")
    answer = gr.Markdown(label="Answer")
    with gr.Accordion("Context", open=True):
        context = gr.Markdown(label="Context")
    with gr.Accordion("Raw Text", open=False):
        passages = gr.Markdown(label="Passages")
    ask.click(fn=do_ask, inputs=[query, buildb,
                                 openai_api_key, dataset,
                                 length, marg, k, sources, docs],
              outputs=[answer, context, passages, docs, stats])

demo.queue(concurrency_count=20)
demo.launch(show_error=True)