singhvaibhav924 committed on
Commit
3959f99
·
1 Parent(s): 9117519

Literature Review Production Done

Browse files
Files changed (4) hide show
  1. Dockerfile +11 -0
  2. app.py +53 -0
  3. helper.py +159 -0
  4. requirements.txt +0 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the literature-review API.
FROM python:3.9

WORKDIR /code

# Copy the requirements first so the dependency layer is cached
# independently of application source changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# Serve the FastAPI instance `app` from app.py with uvicorn. uvicorn takes
# --host/--port; the "-b host:port" bind flag belongs to gunicorn.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uvicorn
2
+ import helper
3
+ from fastapi import FastAPI
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from pydantic import BaseModel
6
+ from dotenv import load_dotenv
7
+
8
# Read environment variables from a local .env file, if one exists.
load_dotenv()
app = FastAPI()
# The API is open to any origin and uses no cookies or HTTP auth, so
# credentials stay disabled: CORS does not allow wildcard origins/headers to
# be combined with credentialed requests.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)

# Model/agent registry; stays None until helper.init_pipeline() has run.
llms = None
# Instruction text placed in front of the data block in the generation prompt.
base_prompt = "You will be provided with an abstract of a scientific document and other references papers in triple quotes. Your task is to write the related work section of the document using only the provided abstracts and other references papers. Please write the related work section creating a cohesive storyline by doing a critical analysis of prior work comparing the strengths and weaknesses while also motivating the proposed approach. You are also provided a sentence plan mentioning the total number of lines and the citations to refer in different lines. You should cite all the other related documents as [#] whenever you are referring it in the related work. Do not cite abstract. Do not include any extra notes or newline characters at the end. Do not copy the abstracts of reference papers directly but compare and contrast to the main work concisely. Do not provide the output in bullet points. Do not provide references at the end. Please cite all the provided reference papers. Please follow the plan when generating sentences, especially the number of lines to generate."
# Outline of the section that is appended to the prompt as the "Plan".
sentence_plan = "1. Introduction sentence\n2. Overview of relevant studies\n3. Detailed discussion on key papers\n4. Summary of related work\n"
21
+
22
class RequestData(BaseModel):
    """Request body of the literature-survey endpoint."""

    # Abstract of the document a related-work section is requested for.
    abstract: str
24
+
25
class ResponseData(BaseModel):
    """Response body of the literature-survey endpoint."""

    # Generated related-work text, including the appended reference list.
    summary: str
    # arXiv short ids of the cited papers. The endpoint returns an "ids" key;
    # it must be declared here, otherwise response_model filtering strips it
    # from the serialized response.
    ids: list[str] = []
27
+
28
@app.post("/generateLiteratureSurvey/", response_model=ResponseData)
async def generate_literature_survey(request_data: RequestData):
    """Generate a related-work section for the submitted abstract.

    Runs the full `summarize` pipeline with the module-level `llms` registry
    and returns the generated text under "summary" and the paper ids under
    "ids".
    """
    # NOTE(review): summarize() is a synchronous call inside an async handler,
    # so the event loop is occupied for the whole generation.
    review_text, paper_ids = summarize(request_data.abstract, llms)
    return dict(summary=review_text, ids=paper_ids)
34
+
35
@app.get("/")
async def root():
    """Report readiness: status 1 once the model registry is loaded, else 0."""
    # Identity comparison is the correct test for None.
    return {"status": 0 if llms is None else 1}
40
+
41
def summarize(query, llms):
    """Run the literature-review pipeline for one abstract.

    Steps: extract keyphrases from `query`, search arXiv with them, re-rank
    the hits against `query`, then generate the related-work text.

    Args:
        query: Abstract of the document under review.
        llms: Registry with the keys 'feature_extractor', 'arxiv_agent',
            'ranker', 'summarizer' and 'summarizer_tokenizer'.

    Returns:
        A `(literature_review, ids)` pair as produced by
        `helper.generate_related_work`.
    """
    keyphrases = helper.extract_keywords(llms['feature_extractor'], query)
    candidates = helper.search_papers(llms['arxiv_agent'], keyphrases)
    ranked = helper.re_rank_papers(llms['ranker'], query, candidates)
    review, paper_ids = helper.generate_related_work(
        llms['summarizer'],
        llms['summarizer_tokenizer'],
        query,
        ranked,
        base_prompt,
        sentence_plan,
    )
    return review, paper_ids
47
+
48
# Models are loaded once, at import time, so the registry is populated both
# when the module is run directly and when an ASGI server imports `app`.
print("Program running")
llms = helper.init_pipeline()
print('Model loaded')

# Direct execution: serve the app with uvicorn's default settings.
if __name__ == "__main__":
    uvicorn.run(app)
helper.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TokenClassificationPipeline, AutoModelForTokenClassification, pipeline
2
+ from langchain_community.utilities import ArxivAPIWrapper
3
+ from transformers.pipelines import AggregationStrategy
4
+ from sentence_transformers import SentenceTransformer
5
+ import arxiv
6
+ import numpy as np
7
+ import torch
8
+
9
# Hugging Face model identifiers used by init_pipeline().
# Causal LM that writes the related-work text.
summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
# Token-classification model used for keyphrase extraction.
feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
# Sentence-embedding model used to re-rank search results.
ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"
12
+
13
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that yields de-duplicated keyphrases."""

    def __init__(self, model, *args, **kwargs):
        """Load the model and tokenizer named by `model` and build the pipeline.

        Args:
            model: Model identifier passed to both `from_pretrained` calls.
            *args: Forwarded to `TokenClassificationPipeline`.
            **kwargs: Forwarded to `TokenClassificationPipeline`.
        """
        classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(*args, model=classifier, tokenizer=tokenizer, **kwargs)

    def postprocess(self, all_outputs):
        """Return the unique, whitespace-stripped "word" values as a NumPy array."""
        entities = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        phrases = [entity.get("word").strip() for entity in entities]
        return np.unique(phrases)
28
+
29
def init_pipeline():
    """Load every model and agent the literature-review pipeline needs.

    The summarizer runs on CUDA in float16 when a GPU is available. On
    CPU-only hosts it falls back to CPU in float32 instead of failing at
    load time.

    Returns:
        dict with the keys "summarizer", "summarizer_tokenizer",
        "feature_extractor", "ranker" and "arxiv_agent".
    """
    use_cuda = torch.cuda.is_available()
    summarizer_model = AutoModelForCausalLM.from_pretrained(
        summarizer_model_name,
        device_map="cuda" if use_cuda else "cpu",
        # Half precision is only used on the GPU.
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        trust_remote_code=True,
    )
    summarizer_tokenizer = AutoTokenizer.from_pretrained(summarizer_model_name)

    feature_extractor_model = KeyphraseExtractionPipeline(model=feature_extractor_model_name)

    ranker_model = SentenceTransformer(ranker_model_name)

    arxiv_agent = ArxivAPIWrapper(top_k_results=5, doc_content_chars_max=None, load_max_docs=10)
    return {
        "summarizer": summarizer_model,
        "summarizer_tokenizer": summarizer_tokenizer,
        "feature_extractor": feature_extractor_model,
        "ranker": ranker_model,
        "arxiv_agent": arxiv_agent,
    }
50
+
51
def extract_keywords(model, abstract):
    """Apply the callable `model` to `abstract`, echo the result and return it."""
    extracted = model(abstract)
    print(extracted)
    return extracted
55
+
56
+
57
def search_papers(arxiv_agent, keywords):
    """Search arXiv for papers matching the extracted keywords.

    Args:
        arxiv_agent: Object exposing `get_summaries_as_docs(query)`.
        keywords: Sequence of keyword strings (a list or a NumPy array).

    Returns:
        Whatever `get_summaries_as_docs` returns for the space-joined
        keywords, or an empty list when there is nothing to search for.
    """
    # len() instead of truthiness: a multi-element NumPy array has no
    # unambiguous truth value.
    if len(keywords) == 0:
        return []
    query = " ".join(keywords)
    if not query.strip():
        return []
    return arxiv_agent.get_summaries_as_docs(query)
63
+
64
def re_rank_papers(model, query_abstract, papers):
    """Order papers by embedding similarity to the query abstract.

    Args:
        model: Encoder exposing `encode(list_of_texts)` that returns one
            embedding row per text.
        query_abstract: Abstract the papers are compared against.
        papers: Iterable of documents with `page_content` (the summary) and
            `metadata['Title']`.

    Returns:
        dict mapping each summary to `{"Title": ..., "score": ...}`, where
        "score" is the cosine similarity to the query. Entries are ordered
        from most to least similar. Empty when `papers` is empty.
    """
    summaries = {paper.page_content: {"Title": paper.metadata['Title']} for paper in papers}
    if not summaries:
        return {}
    print(summaries)

    # encode() may return NumPy arrays; torch's cosine_similarity needs tensors.
    target_embeddings = torch.as_tensor(model.encode([query_abstract]))
    summaries_embeddings = torch.as_tensor(model.encode(list(summaries)))

    # The (1, d) query row broadcasts against the (n, d) summaries: one score each.
    similarities = torch.nn.functional.cosine_similarity(
        target_embeddings, summaries_embeddings
    ).tolist()

    for info, score in zip(summaries.values(), similarities):
        info["score"] = score
    # Highest similarity first, so the most relevant paper becomes reference [1].
    return dict(sorted(summaries.items(), key=lambda item: item[1]["score"], reverse=True))
78
+
79
def format_abstracts_as_references(papers):
    """Render the paper summaries as numbered reference lines.

    Args:
        papers: Mapping whose keys are the paper summaries, in citation order.

    Returns:
        One "[n]: summary" line per key, numbered from 1 and each terminated
        by a newline; an empty string for an empty mapping.
    """
    return "".join(
        f"[{number}]: {summary}\n" for number, summary in enumerate(papers, start=1)
    )
87
+
88
def format_authors(authors):
    """Format authors as "Lastname Initials", joined by ", ".

    The last whitespace-separated word of each name is taken as the family
    name; the first letters of all preceding words form the initials.

    Args:
        authors: Iterable of objects with a `name` string attribute.

    Returns:
        The formatted author list. Blank names are skipped, and a
        single-word name is emitted without a trailing space.
    """
    formatted_authors = []
    for author in authors:
        name_parts = author.name.split()
        if not name_parts:
            # Nothing to format; avoids indexing into an empty list.
            continue
        *given_names, last_name = name_parts
        initials = "".join(part[0] for part in given_names)
        formatted_authors.append(f"{last_name} {initials}".rstrip())
    return ", ".join(formatted_authors)
96
+
97
def to_vancouver_style(entry):
    """Build a one-line citation string for an arXiv result.

    Args:
        entry: Object with `authors`, `title`, `published.year` and
            `get_short_id()`.

    Returns:
        "<authors>. <title>. arXiv. <year>. arXiv:<short id>".
    """
    segments = [
        format_authors(entry.authors),
        entry.title,
        'arXiv',
        entry.published.year,
        f"arXiv:{entry.get_short_id()}",
    ]
    return ". ".join(f"{segment}" for segment in segments)
104
+
105
def generate_refs(papers):
    """Build the numbered reference list for the ranked papers.

    Each paper is looked up on arXiv by its title, and the top hit is
    formatted with `to_vancouver_style`.

    Args:
        papers: Mapping whose values carry a "Title" entry, in citation order.

    Returns:
        A `(refs, ids)` pair. `refs` is a "\\n\\nReferences:\\n" header
        followed by one "[n] reference" line per paper. `ids` holds the arXiv
        short ids of the papers that were found. A paper whose lookup returns
        nothing keeps its number, with the bare title as its reference, so
        numbering still matches the in-text citations.
    """
    client = arxiv.Client()
    references = []
    ids = []
    for info in papers.values():
        title = info["Title"]
        search = arxiv.Search(
            query=title,
            max_results=1,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        # Take the first hit without indexing into a possibly empty list.
        entry = next(iter(client.results(search)), None)
        if entry is None:
            references.append(title)
            continue
        references.append(to_vancouver_style(entry))
        ids.append(entry.get_short_id())

    numbered = "".join(
        f"[{number}] {reference}\n" for number, reference in enumerate(references, start=1)
    )
    return f"\n\nReferences:\n{numbered}", ids
124
+
125
def generate_related_work(model, tokenizer, query_abstract, ranked_papers, base_prompt, sentence_plan):
    """Generate the related-work text and append its reference list.

    Args:
        model: Causal language model passed to the text-generation pipeline.
        tokenizer: Tokenizer matching `model`.
        query_abstract: Abstract of the document under review.
        ranked_papers: Mapping of paper summary to its info dict, in citation
            order.
        base_prompt: Instruction text placed before the data block.
        sentence_plan: Outline added to the data block as the plan.

    Returns:
        A `(related_work, ids)` pair: the generated text followed by the
        reference list, and the arXiv ids returned by `generate_refs`.

    Side effects:
        Prints the raw pipeline output and overwrites "literature review.txt"
        in the working directory with the final text.
    """
    data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
    complete_prompt = f"{base_prompt}\n```{data}```"
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": complete_prompt},
    ]

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    generation_args = {
        "max_new_tokens": 1600,
        # Return only the newly generated text, not the prompt.
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False,
    }

    output = pipe(messages, **generation_args)
    print(output)
    related_work = output[0]['generated_text']
    refs, ids = generate_refs(ranked_papers)
    related_work += refs
    # The context manager closes the file even if the write fails; explicit
    # UTF-8 keeps non-ASCII characters in the text from depending on the locale.
    with open("literature review.txt", "w", encoding="utf-8") as review_file:
        review_file.write(related_work)
    return related_work, ids
requirements.txt ADDED
Binary file (282 Bytes). View file