File size: 4,922 Bytes
6d0c6c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import json
from llm_helper import llm
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.exceptions import OutputParserException
import sqlite3


# Legacy file-based pipeline, superseded by process_posts_for_persona():
# def process_posts(raw_file_path, processed_file_path=None):
#     with open(raw_file_path, encoding='utf-8') as file:
#         posts = json.load(file)
#         enriched_posts = []
#         for post in posts:
#             metadata = extract_metadata(post['text_blocks'])
#             post_with_metadata = post | metadata
#             enriched_posts.append(post_with_metadata)

def process_posts_for_persona(persona_name, processed_file_path=None):
    """Enrich every post of a persona with LLM-extracted metadata.

    Args:
        persona_name: Persona whose posts are loaded from the database.
        processed_file_path: Optional path; when given, the enriched posts
            are also written there as an indented JSON array.

    Returns:
        list[dict]: One dict per post holding the original ``text`` plus
        the metadata keys returned by extract_metadata (line_count,
        language, tags). Empty list when the persona has no posts.
    """
    posts = get_posts_by_persona(persona_name)

    if not posts:
        print(f"No posts found for persona '{persona_name}'.")
        # Return an empty list instead of None so callers always get a
        # list back (the original returned None on this path).
        return []

    enriched_posts = []
    for post in posts:
        metadata = extract_metadata(post)  # LLM call per post
        # Merge the metadata keys into a dict alongside the raw text.
        enriched_posts.append({"text": post} | metadata)

    if processed_file_path:
        with open(processed_file_path, "w", encoding="utf-8") as outfile:
            json.dump(enriched_posts, outfile, indent=4)

    return enriched_posts


def get_posts_by_persona(persona_name):
    """Fetch all post texts for the given persona from personas.db.

    Args:
        persona_name: Name to look up in the ``personas`` table.

    Returns:
        list[str]: The ``text_blocks`` value of every post belonging to
        the persona; empty list when the persona does not exist.
    """
    conn = sqlite3.connect("personas.db")
    try:
        cursor = conn.cursor()

        # Resolve the persona name to its primary key.
        cursor.execute(
            "SELECT persona_id FROM personas WHERE name = ?", (persona_name,)
        )
        persona = cursor.fetchone()

        if not persona:
            print(f"Persona '{persona_name}' not found.")
            return []

        # Fetch all posts belonging to this persona.
        cursor.execute(
            "SELECT text_blocks FROM posts WHERE persona_id = ?", (persona[0],)
        )
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Close on every path: the original leaked the connection on the
        # persona-not-found early return, and carried unreachable code
        # (referencing undefined names) after its final return.
        conn.close()


def extract_metadata(post):
    """Extract line_count, language and tags for one post via the LLM.

    Args:
        post: Raw text of a single LinkedIn post.

    Returns:
        dict: Parsed JSON with exactly the keys ``line_count``,
        ``language`` and ``tags`` (list of at most two strings).

    Raises:
        OutputParserException: When the LLM response is not valid JSON.
    """
    template = '''
    You are given a LinkedIn post. You need to extract number of lines, language of the post and tags.
    1. Return a valid JSON. No preamble. 
    2. JSON object should have exactly three keys: line_count, language and tags. 
    3. tags is an array of text tags. Extract maximum two tags.
    4. Language should be English, Kannada and Hindi

    Here is the actual post on which you need to perform this task:  
    {post}
    '''

    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    response = chain.invoke(input={"post": post})

    json_parser = JsonOutputParser()
    # Keep the try body minimal: only the parse can raise here.
    try:
        res = json_parser.parse(response.content)
    except OutputParserException as err:
        # Chain the cause and report the actual subject (posts, not the
        # "jobs" message copy-pasted from another project).
        raise OutputParserException(
            "Context too big. Unable to parse post metadata."
        ) from err
    return res


def get_unified_tags(posts_with_metadata):
    """Ask the LLM to merge similar tags into a unified mapping.

    Args:
        posts_with_metadata: Iterable of enriched post dicts, each with a
            ``tags`` list (as produced by extract_metadata).

    Returns:
        dict: Mapping of each original tag to its unified tag.

    Raises:
        OutputParserException: When the LLM response is not valid JSON.
    """
    # Collect the distinct tags across all posts.
    unique_tags = set()
    for post in posts_with_metadata:
        unique_tags.update(post['tags'])

    unique_tags_list = ','.join(unique_tags)

    # NOTE: numbering fixed (two items were labelled "3.") and the example
    # JSON repaired (a closing quote was missing), so the model sees a
    # well-formed specification.
    template = '''I will give you a list of tags. You need to unify tags with the following requirements,
    1. Tags are unified and merged to create a shorter list. 
       Example 1: "Jobseekers", "Job Hunting" can be all merged into a single tag "Job Search". 
       Example 2: "Motivation", "Inspiration", "Drive" can be mapped to "Motivation"
       Example 3: "Personal Growth", "Personal Development", "Self Improvement" can be mapped to "Self Improvement"
       Example 4: "Scam Alert", "Job Scam" etc. can be mapped to "Scams"
       Example 5: "Finance", "economics", "currency" etc., can be mapped to "Financial literacy"
    2. Each tag should be follow title case convention. example: "Motivation", "Job Search"
    3. Output should be a JSON object, No preamble
    4. Output should have mapping of original tag and the unified tag. 
       For example: {{"Jobseekers": "Job Search",  "Job Hunting": "Job Search", "Motivation": "Motivation"}}

    Here is the list of tags: 
    {tags}
    '''
    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    # unique_tags_list is already a string; no str() wrapper needed.
    response = chain.invoke(input={"tags": unique_tags_list})

    json_parser = JsonOutputParser()
    try:
        res = json_parser.parse(response.content)
    except OutputParserException as err:
        # Chain the cause and correct the copy-pasted "jobs" message.
        raise OutputParserException(
            "Context too big. Unable to parse unified tags."
        ) from err
    return res




if __name__ == "__main__":
    # Interactive entry point: process one persona's posts and write the
    # enriched output under data/.
    persona_name = input("Enter the persona name: ")
    output_path = f"data/processed_{persona_name.lower()}_posts.json"
    process_posts_for_persona(persona_name, output_path)