SarthakBhatore committed
Commit 5016268 · verified · 1 Parent(s): 18b37d7

Upload 6 files

helpers/__init__ ADDED
File without changes
helpers/__pycache__/df_helpers.cpython-312.pyc ADDED
Binary file (3.92 kB)
 
helpers/__pycache__/prompts.cpython-312.pyc ADDED
Binary file (4.03 kB)
 
helpers/df_helpers.py ADDED
@@ -0,0 +1,71 @@
+ import uuid
+ import pandas as pd
+ import numpy as np
+ from .prompts import extractConcepts
+ from .prompts import graphPrompt
+
+
+ def documents2Dataframe(documents) -> pd.DataFrame:
+     rows = []
+     for chunk in documents:
+         row = {
+             "text": chunk.page_content,
+             **chunk.metadata,
+             "chunk_id": uuid.uuid4().hex,
+         }
+         rows.append(row)
+
+     df = pd.DataFrame(rows)
+     return df
+
+
+ def df2ConceptsList(dataframe: pd.DataFrame) -> list:
+     # dataframe.reset_index(inplace=True)
+     results = dataframe.apply(
+         lambda row: extractConcepts(
+             row.text, {"chunk_id": row.chunk_id, "type": "concept"}
+         ),
+         axis=1,
+     )
+     # invalid json results in NaN
+     results = results.dropna()
+     results = results.reset_index(drop=True)
+
+     ## Flatten the list of lists to one single list of entities.
+     concept_list = np.concatenate(results).ravel().tolist()
+     return concept_list
+
+
+ def concepts2Df(concepts_list) -> pd.DataFrame:
+     ## Remove all NaN entities
+     concepts_dataframe = pd.DataFrame(concepts_list).replace(" ", np.nan)
+     concepts_dataframe = concepts_dataframe.dropna(subset=["entity"])
+     concepts_dataframe["entity"] = concepts_dataframe["entity"].apply(
+         lambda x: x.lower()
+     )
+
+     return concepts_dataframe
+
+
+ def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
+     # dataframe.reset_index(inplace=True)
+     results = dataframe.apply(
+         lambda row: graphPrompt(row.text, {"chunk_id": row.chunk_id}, model), axis=1
+     )
+     # invalid json results in NaN
+     results = results.dropna()
+     results = results.reset_index(drop=True)
+
+     ## Flatten the list of lists to one single list of entities.
+     concept_list = np.concatenate(results).ravel().tolist()
+     return concept_list
+
+
+ def graph2Df(nodes_list) -> pd.DataFrame:
+     ## Remove all NaN entities
+     graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
+     graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
+     graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: x.lower())
+     graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: x.lower())
+
+     return graph_dataframe
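
Taken together, these helpers form a small chunks-to-graph pipeline: documents2Dataframe turns text chunks into a DataFrame with a fresh chunk_id per row, df2ConceptsList/df2Graph fan each row out to the LLM prompts, and concepts2Df/graph2Df clean the returned records. Below is a minimal usage sketch that skips the LLM call so it stays self-contained; FakeChunk is a hypothetical stand-in for langchain-style Document objects (anything exposing page_content and metadata), the nodes list hand-writes what df2Graph might return, and it assumes the script runs from the repository root with the dependencies imported by prompts.py (yachalk, the ollama client) installed.

    # Usage sketch under the assumptions above; no model is contacted.
    from dataclasses import dataclass

    from helpers.df_helpers import documents2Dataframe, graph2Df


    @dataclass
    class FakeChunk:
        # Hypothetical stand-in for a langchain Document-like chunk.
        page_content: str
        metadata: dict


    chunks = [
        FakeChunk("Paris is the capital of France.", {"source": "demo.txt"}),
        FakeChunk("The Seine flows through Paris.", {"source": "demo.txt"}),
    ]

    # One row per chunk: text, the chunk's metadata columns, and a new chunk_id.
    df = documents2Dataframe(chunks)
    print(df[["text", "chunk_id"]])

    # Hand-written records standing in for parsed graphPrompt output.
    nodes = [
        {"node_1": "Paris", "node_2": "France", "edge": "capital of",
         "chunk_id": df.loc[0, "chunk_id"]},
        {"node_1": "Seine", "node_2": "Paris", "edge": "flows through",
         "chunk_id": df.loc[1, "chunk_id"]},
    ]

    # Drops rows with missing nodes and lower-cases node names.
    graph_df = graph2Df(nodes)
    print(graph_df)
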
helpers/prompts.py ADDED
@@ -0,0 +1,81 @@
+ import sys
+ from yachalk import chalk
+ sys.path.append("..")
+
+ import json
+ import ollama.client as client
+
+
+ def extractConcepts(prompt: str, metadata={}, model="mistral-openorca:latest"):
+     SYS_PROMPT = (
+         "Your task is to extract the key concepts (and non-personal entities) mentioned in the given context. "
+         "Focus on identifying only the most important and fundamental concepts. If needed, break the concepts down into simpler, more atomic components. "
+         "Ensure each concept is clear and distinct. "
+         "Categorize each concept into one of the following categories: [event, concept, place, object, document, organisation, condition, misc].\n"
+         "For each extracted concept, assess its contextual importance on a scale from 1 to 5, with 5 being the highest level of importance. "
+         "Format your output as a JSON list with the following structure:\n"
+         "[\n"
+         "    {\n"
+         '        "entity": "The Concept",\n'
+         '        "importance": "The contextual importance of the concept on a scale of 1 to 5",\n'
+         '        "category": "The Type of Concept",\n'
+         "    },\n"
+         "    {\n"
+         '        "entity": "Another Concept",\n'
+         '        "importance": "The contextual importance of this concept on a scale of 1 to 5",\n'
+         '        "category": "The Type of Concept",\n'
+         "    },\n"
+         "    // Additional concepts can be added in the same format\n"
+         "]\n"
+         "Ensure the output is well-structured and free of errors."
+     )
+
+     response, _ = client.generate(model_name=model, system=SYS_PROMPT, prompt=prompt)
+     try:
+         result = json.loads(response)
+         result = [dict(item, **metadata) for item in result]
+     except Exception:
+         print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
+         result = None
+     return result
+
+
+ def graphPrompt(input: str, metadata={}, model="mistral-openorca:latest"):
+     if model is None:
+         model = "mistral-openorca:latest"
+
+     # model_info = client.show(model_name=model)
+     # print(chalk.blue(model_info))
+
+     SYS_PROMPT = (
+         "You are a network graph maker who extracts terms and their relations from a given context. "
+         "You are provided with a context chunk (delimited by ```). Your task is to extract the ontology "
+         "of terms mentioned in the given context. These terms should represent the key concepts as per the context.\n"
+         "Thought 1: While traversing through each sentence, think about the key terms mentioned in it.\n"
+         "\tTerms may include object, entity, location, organization, person,\n"
+         "\tcondition, acronym, documents, service, concept, etc.\n"
+         "\tTerms should be as atomistic as possible.\n\n"
+         "Thought 2: Think about how these terms can have a one-on-one relation with other terms.\n"
+         "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
+         "\tTerms can be related to many other terms.\n\n"
+         "Thought 3: Find out the relation between each such related pair of terms.\n\n"
+         "Format your output as a list of JSON. Each element of the list contains a pair of terms "
+         "and the relation between them, like the following:\n"
+         "[\n"
+         "    {\n"
+         '        "node_1": "A concept from extracted ontology",\n'
+         '        "node_2": "A related concept from extracted ontology",\n'
+         '        "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
+         "    }, {...}\n"
+         "]"
+     )
+
+     USER_PROMPT = f"context: ```{input}``` \n\n output: "
+     response, _ = client.generate(model_name=model, system=SYS_PROMPT, prompt=USER_PROMPT)
+     try:
+         result = json.loads(response)
+         result = [dict(item, **metadata) for item in result]
+     except Exception:
+         print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
+         result = None
+     return result
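
Both extractConcepts and graphPrompt end the same way: the raw model response is parsed as JSON and the chunk metadata is merged into every record, with None returned when parsing fails. A tiny sketch of just that merge step, using a hand-written response string in place of an actual client.generate() call:

    # Sketch of the JSON-parse-and-merge step only; the response string below is
    # a made-up example of output that follows the format requested in SYS_PROMPT.
    import json

    response = '[{"node_1": "Paris", "node_2": "France", "edge": "capital of"}]'
    metadata = {"chunk_id": "abc123"}

    result = json.loads(response)
    result = [dict(item, **metadata) for item in result]  # tag each record with its chunk_id
    print(result)
    # [{'node_1': 'Paris', 'node_2': 'France', 'edge': 'capital of', 'chunk_id': 'abc123'}]
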
requirements.txt ADDED
Binary file (4.14 kB)