Spaces:
Runtime error
Runtime error
raannakasturi
commited on
Update generate_markdown.py
Browse files- generate_markdown.py +74 -74
generate_markdown.py
CHANGED
@@ -1,75 +1,75 @@
|
|
1 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
2 |
-
from langchain_community.document_loaders import PyPDFLoader
|
3 |
-
from llama_cpp import Llama
|
4 |
-
|
5 |
-
def load_llm_model():
|
6 |
-
try:
|
7 |
-
llm = Llama(
|
8 |
-
model_path="Llama-3.2-1B-Instruct-Q8_0.gguf",
|
9 |
-
n_gpu_layers = -1,
|
10 |
-
n_ctx=100000,
|
11 |
-
n_batch=4096,
|
12 |
-
)
|
13 |
-
print("LLM model loaded successfully")
|
14 |
-
return llm
|
15 |
-
except Exception as e:
|
16 |
-
print(f"Error loading LLM model: {e}")
|
17 |
-
raise
|
18 |
-
|
19 |
-
def get_text_from_pdf(file):
|
20 |
-
loader = PyPDFLoader(file)
|
21 |
-
pages = loader.load_and_split()
|
22 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
|
23 |
-
texts = text_splitter.split_documents(pages)
|
24 |
-
final_text = ""
|
25 |
-
for text in texts:
|
26 |
-
if text.page_content.startswith("REFERENCES"):
|
27 |
-
break
|
28 |
-
else:
|
29 |
-
final_text = final_text + text.page_content
|
30 |
-
research_paper = ""
|
31 |
-
for text in final_text:
|
32 |
-
if text.startswith(("REFERENCES", "REFERENCESREFERENCES", "REFERENCESREFERENCESREFERENCES")):
|
33 |
-
break
|
34 |
-
else:
|
35 |
-
research_paper = research_paper + text
|
36 |
-
return research_paper[:10000]
|
37 |
-
|
38 |
-
def generate_prompt(research_paper):
|
39 |
-
prompt = f'''
|
40 |
-
As a text script expert, please help me to write a short text script with the topic \\"{research_paper}\\".Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nSummarize the giving topic to generate a mind map (as many subtitles as possible, with a minimum of three subtitles) structure markdown.\\n Do not include anything in the response, that is not the part of mindmap.\\n Importantly your output must use language \\"English\\""
|
41 |
-
'''
|
42 |
-
return prompt
|
43 |
-
|
44 |
-
def generate_mindmap_structure(llm, prompt):
|
45 |
-
response = llm.create_chat_completion(
|
46 |
-
messages = [
|
47 |
-
{'role':'system',
|
48 |
-
'content': 'You are a helpful research assistant for generating well-formatted mindmaps in MarkDown format from scientific research papers.'},
|
49 |
-
{'role':'user',
|
50 |
-
'content': prompt}
|
51 |
-
],
|
52 |
-
temperature=0.7,
|
53 |
-
top_k=200,
|
54 |
-
top_p=3.0,
|
55 |
-
)
|
56 |
-
mindmap_data = response['choices'][0]['message']['content']
|
57 |
-
return mindmap_data
|
58 |
-
|
59 |
-
def generate_markdown(llm, file):
|
60 |
-
final_text = get_text_from_pdf(file)
|
61 |
-
prompt = generate_prompt(final_text)
|
62 |
-
mindmap_markdown = generate_mindmap_structure(llm, prompt)
|
63 |
-
if "**" in mindmap_markdown:
|
64 |
-
mindmap_markdown = mindmap_markdown.replace("- **", "### ")
|
65 |
-
mindmap_markdown = mindmap_markdown.replace("**", "")
|
66 |
-
else:
|
67 |
-
pass
|
68 |
-
return mindmap_markdown
|
69 |
-
|
70 |
-
def sanitize_markdown(llm, mindmap_markdown):
|
71 |
-
prompt = f'''
|
72 |
-
As an experienced coder and programmer, help me convert the text \\"{mindmap_markdown}\\" into a well-formatted markdown. Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nDo not include anything in the response, that is not the part of mindmap."
|
73 |
-
'''
|
74 |
-
sanitized_markdown = generate_mindmap_structure(llm, prompt)
|
75 |
return sanitized_markdown
|
|
|
1 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
2 |
+
from langchain_community.document_loaders import PyPDFLoader
|
3 |
+
from llama_cpp import Llama
|
4 |
+
|
5 |
+
def load_llm_model():
|
6 |
+
try:
|
7 |
+
llm = Llama(
|
8 |
+
model_path="/home/user/app/Llama-3.2-1B-Instruct-Q8_0.gguf",
|
9 |
+
n_gpu_layers = -1,
|
10 |
+
n_ctx=100000,
|
11 |
+
n_batch=4096,
|
12 |
+
)
|
13 |
+
print("LLM model loaded successfully")
|
14 |
+
return llm
|
15 |
+
except Exception as e:
|
16 |
+
print(f"Error loading LLM model: {e}")
|
17 |
+
raise
|
18 |
+
|
19 |
+
def get_text_from_pdf(file):
|
20 |
+
loader = PyPDFLoader(file)
|
21 |
+
pages = loader.load_and_split()
|
22 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
|
23 |
+
texts = text_splitter.split_documents(pages)
|
24 |
+
final_text = ""
|
25 |
+
for text in texts:
|
26 |
+
if text.page_content.startswith("REFERENCES"):
|
27 |
+
break
|
28 |
+
else:
|
29 |
+
final_text = final_text + text.page_content
|
30 |
+
research_paper = ""
|
31 |
+
for text in final_text:
|
32 |
+
if text.startswith(("REFERENCES", "REFERENCESREFERENCES", "REFERENCESREFERENCESREFERENCES")):
|
33 |
+
break
|
34 |
+
else:
|
35 |
+
research_paper = research_paper + text
|
36 |
+
return research_paper[:10000]
|
37 |
+
|
38 |
+
def generate_prompt(research_paper):
|
39 |
+
prompt = f'''
|
40 |
+
As a text script expert, please help me to write a short text script with the topic \\"{research_paper}\\".Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nSummarize the giving topic to generate a mind map (as many subtitles as possible, with a minimum of three subtitles) structure markdown.\\n Do not include anything in the response, that is not the part of mindmap.\\n Importantly your output must use language \\"English\\""
|
41 |
+
'''
|
42 |
+
return prompt
|
43 |
+
|
44 |
+
def generate_mindmap_structure(llm, prompt):
|
45 |
+
response = llm.create_chat_completion(
|
46 |
+
messages = [
|
47 |
+
{'role':'system',
|
48 |
+
'content': 'You are a helpful research assistant for generating well-formatted mindmaps in MarkDown format from scientific research papers.'},
|
49 |
+
{'role':'user',
|
50 |
+
'content': prompt}
|
51 |
+
],
|
52 |
+
temperature=0.7,
|
53 |
+
top_k=200,
|
54 |
+
top_p=3.0,
|
55 |
+
)
|
56 |
+
mindmap_data = response['choices'][0]['message']['content']
|
57 |
+
return mindmap_data
|
58 |
+
|
59 |
+
def generate_markdown(llm, file):
|
60 |
+
final_text = get_text_from_pdf(file)
|
61 |
+
prompt = generate_prompt(final_text)
|
62 |
+
mindmap_markdown = generate_mindmap_structure(llm, prompt)
|
63 |
+
if "**" in mindmap_markdown:
|
64 |
+
mindmap_markdown = mindmap_markdown.replace("- **", "### ")
|
65 |
+
mindmap_markdown = mindmap_markdown.replace("**", "")
|
66 |
+
else:
|
67 |
+
pass
|
68 |
+
return mindmap_markdown
|
69 |
+
|
70 |
+
def sanitize_markdown(llm, mindmap_markdown):
|
71 |
+
prompt = f'''
|
72 |
+
As an experienced coder and programmer, help me convert the text \\"{mindmap_markdown}\\" into a well-formatted markdown. Your output should only and strictly use the following template:\\n# {{Title}}\\n## {{Subtitle01}}\\n- {{Emoji01}} Bulletpoint01\\n- {{Emoji02}} Bulletpoint02\\n## {{Subtitle02}}\\n- {{Emoji03}} Bulletpoint03\\n- {{Emoji04}} Bulletpoint04\\n\\nDo not include anything in the response, that is not the part of mindmap."
|
73 |
+
'''
|
74 |
+
sanitized_markdown = generate_mindmap_structure(llm, prompt)
|
75 |
return sanitized_markdown
|