In [None]:
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import VectorDBQA
from langchain.document_loaders import PagedPDFSplitter
from langchain.llms import OpenAI
from langchain import OpenAI, VectorDBQA

In [None]:
# PDF that is indexed into the vector store for the QA section below.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = "1706.03762.pdf"

# Load the document
# NOTE(review): presumably `load()` returns one document per PDF page -- confirm
# against the PagedPDFSplitter documentation for the installed LangChain version.

loader = PagedPDFSplitter(file_path)
docs = loader.load()

In [None]:
# Vector store used by the QA section; texts are embedded with OpenAIEmbeddings.
# NOTE(review): OpenAIEmbeddings presumably needs an OpenAI API key in the environment -- confirm it is set before running.
chroma = Chroma(embedding_function=OpenAIEmbeddings())

In [None]:
# Index the documents loaded from the PDF into the vector store.
# NOTE(review): re-running this cell without recreating `chroma` likely inserts the same documents again -- verify.
chroma.add_documents(docs)

# Text Generation

In [None]:
from langchain.chains import LLMChain
from langchain import PromptTemplate

# Prompt for incremental paper-to-code generation.
# Placeholders:
#   {paper}     -- text of the paper (section) to implement
#   {prev_code} -- code generated so far, so names stay consistent across calls
# The final instruction line previously asked for "a 400 word blog post", which
# contradicted the code-generation task stated above it; it now asks for code.
prompt_template = """
You will be presented with a section of an Arxiv paper. Your job is to write the python + PyTorch code that exactly implements the paper with NO ERRORS.
Additionally, you will be shown previously generated code. You must use this code as a reference and keep variable/function names the same.
Use the paper section and previous code below to write the next piece of code:
 
 Arxiv paper section: {paper}
 Previous Code: {prev_code}
 Next Code: 
"""

# Only `paper` and `prev_code` are template variables.
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["paper", "prev_code"]
)

# temperature=0 is chosen to keep generations as repeatable as possible.
llm = OpenAI(temperature=0)

# Chain that formats PROMPT with the inputs and sends it to `llm`.
chain = LLMChain(llm=llm, prompt=PROMPT)

In [None]:
def generate_code(title, paper, prev_code, **kwargs):
    """Run the code-generation chain for one paper section.

    Args:
        title: Title of the paper. Forwarded to the chain as the ``title`` input.
        paper: Text of the paper (section) to implement.
        prev_code: Previously generated code to be used as a reference.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        Whatever ``chain.apply`` returns for a one-element input list
        (per the LangChain API, a list holding one output dict).
    """
    inputs = {"title": title, "paper": paper, "prev_code": prev_code}
    # `apply` takes a *list* of input dicts; passing the bare dict makes it
    # iterate over the dict's string keys and fail, so wrap it in a list.
    return chain.apply([inputs])

In [None]:
# Read the whole LaTeX source into memory as one string; the path is relative
# to the kernel's working directory.
# NOTE(review): no encoding is given, so the platform default is used -- confirm
# that it matches the file's encoding.
with open("main.tex", "r") as f:
 main_tex = f.read()

In [None]:
# Number of characters in the LaTeX source (displayed as the cell output).
len(main_tex)

In [None]:
# Generate code for the full LaTeX source, seeding the "previous code" with a bare torch import.
# NOTE(review): this uses main.tex, while the QA section indexes the PDF at `file_path` -- confirm both refer to the intended paper(s).
out = generate_code(title="Long Range Language Modeling via Gated State Spaces", prev_code="import torch", paper=main_tex)

# QA

In [None]:
# Question-answering chain backed by the `chroma` vector store built above.
# NOTE(review): the "stuff" chain type presumably puts all retrieved documents into a single prompt -- confirm against the LangChain docs.
qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type="stuff", vectorstore=chroma)

In [None]:
# Ask the QA chain for the paper's purpose; the answer is shown as the cell output.
qa.run("What is the purpose of this paper?")

In [None]:
# Ask the QA chain for the paper's main contribution; the answer is shown as the cell output.
qa.run("What is the main contribution of this paper?")

In [None]:
# Ask the QA chain to propose a notebook outline for the paper.
# Adjacent string literals are concatenated by Python into a single query string.
qa.run(
    "Given the text of the arxiv paper you know about, can you propose a structure "
    "for a jupyter notebook that would summarize the papers key contributions and findings? "
    "The notebook should be structure in a logical and coherent way, with sections and "
    "sub-sections that reflect the papers organization. "
    "Only include portions of the paper that are relevant to code -- for example, "
    "do not include suggestions for further research or future work. "
    "The output should be in this format:\n"
    "- each section should be numbered and have a title (e.g. Training and Inference)\n"
    "- each subsection should start with a dash (e.g., - Overview of the training process)"
)

In [None]:
# Ask the QA chain to draft notebook text and code for the "Training" section.
# Adjacent string literals are concatenated by Python into a single query string.
qa.run(
    "For the following arxiv paper sections, can you generate text descriptions "
    "and code for a jupyter notebook:\n"
    "3. Training\n"
    "- Overview of training data and batching\n"
    "- hardware and schedule\n"
    "- optimizer"
)

In [None]:
# Ask the QA chain for Python code implementing the paper.
# The query was previously an incomplete sentence (no verb, ended with a period);
# it is now a well-formed question.
qa.run("What is the python code that implements this paper?")