import json
import sqlite3

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.exceptions import OutputParserException

from llm_helper import llm

# Legacy entry point that read posts from a raw JSON file instead of the database:
# def process_posts(raw_file_path, processed_file_path=None):
#     with open(raw_file_path, encoding='utf-8') as file:
#         posts = json.load(file)
#     enriched_posts = []
#     for post in posts:
#         metadata = extract_metadata(post['text_blocks'])
#         post_with_metadata = post | metadata
#         enriched_posts.append(post_with_metadata)


def process_posts_for_persona(persona_name, processed_file_path=None):
    """Fetch all posts for a persona, enrich them with metadata, and optionally save them."""
    posts = get_posts_by_persona(persona_name)
    if not posts:
        print(f"No posts found for persona '{persona_name}'.")
        return []

    enriched_posts = []
    for post in posts:
        metadata = extract_metadata(post)
        post_with_metadata = {"text": post} | metadata  # Merge the post text with its extracted metadata
        enriched_posts.append(post_with_metadata)
    # Unify tags across all enriched posts so similar tags collapse into one canonical tag
    unified_tags = get_unified_tags(enriched_posts)
    for post in enriched_posts:
        current_tags = post['tags']
        post['tags'] = list({unified_tags[tag] for tag in current_tags})

    if processed_file_path:
        with open(processed_file_path, mode="w", encoding="utf-8") as outfile:
            json.dump(enriched_posts, outfile, indent=4)

    return enriched_posts


def get_posts_by_persona(persona_name):
    """Fetch all posts for a given persona from the SQLite database."""
    conn = sqlite3.connect("personas.db")
    cursor = conn.cursor()

    # Look up the persona's ID by name
    cursor.execute("SELECT persona_id FROM personas WHERE name = ?", (persona_name,))
    persona = cursor.fetchone()
    if not persona:
        print(f"Persona '{persona_name}' not found.")
        conn.close()
        return []
    persona_id = persona[0]

    # Fetch all posts belonging to this persona
    cursor.execute("SELECT text_blocks FROM posts WHERE persona_id = ?", (persona_id,))
    posts = [row[0] for row in cursor.fetchall()]
    conn.close()
    return posts
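

# The queries above imply a minimal SQLite schema. The helper below is an
# illustrative sketch only: table and column names are inferred from the
# SELECT statements, not taken from an authoritative schema, so adjust it
# to match your actual personas.db before using it.
def init_personas_db(db_path="personas.db"):
    """Create the tables assumed by get_posts_by_persona (sketch, not authoritative)."""
    conn = sqlite3.connect(db_path)
    conn.executescript(
        """
        CREATE TABLE IF NOT EXISTS personas (
            persona_id INTEGER PRIMARY KEY,
            name       TEXT UNIQUE NOT NULL
        );
        CREATE TABLE IF NOT EXISTS posts (
            post_id     INTEGER PRIMARY KEY,
            persona_id  INTEGER NOT NULL REFERENCES personas(persona_id),
            text_blocks TEXT NOT NULL
        );
        """
    )
    conn.commit()
    conn.close()

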
def extract_metadata(post):
    """Use the LLM to extract the line count, language, and tags of a single post."""
    template = '''
    You are given a LinkedIn post. You need to extract the number of lines, the language of the post, and tags.
    1. Return a valid JSON object. No preamble.
    2. The JSON object should have exactly three keys: line_count, language and tags.
    3. tags is an array of text tags. Extract a maximum of two tags.
    4. language should be one of: English, Kannada, or Hindi.

    Here is the actual post on which you need to perform this task:
    {post}
    '''

    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    response = chain.invoke(input={"post": post})

    try:
        json_parser = JsonOutputParser()
        res = json_parser.parse(response.content)
    except OutputParserException:
        raise OutputParserException("Context too big. Unable to parse the post metadata.")
    return res
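
# For reference, extract_metadata is expected to return a dict shaped roughly like
#   {"line_count": 5, "language": "English", "tags": ["Job Search", "Motivation"]}
# (illustrative values only; the keys follow the prompt above, the values depend on the LLM response).
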
def get_unified_tags(posts_with_metadata):
    """Use the LLM to merge similar tags across all posts into a unified mapping."""
    # Collect every unique tag across the enriched posts
    unique_tags = set()
    for post in posts_with_metadata:
        unique_tags.update(post['tags'])

    unique_tags_list = ','.join(unique_tags)

    template = '''I will give you a list of tags. You need to unify tags with the following requirements,
    1. Tags are unified and merged to create a shorter list.
       Example 1: "Jobseekers", "Job Hunting" can all be merged into a single tag "Job Search".
       Example 2: "Motivation", "Inspiration", "Drive" can be mapped to "Motivation"
       Example 3: "Personal Growth", "Personal Development", "Self Improvement" can be mapped to "Self Improvement"
       Example 4: "Scam Alert", "Job Scam" etc. can be mapped to "Scams"
       Example 5: "Finance", "economics", "currency" etc. can be mapped to "Financial Literacy"
    2. Each tag should follow the title case convention. Example: "Motivation", "Job Search"
    3. Output should be a JSON object. No preamble.
    4. Output should have a mapping of each original tag to its unified tag.
       For example: {{"Jobseekers": "Job Search", "Job Hunting": "Job Search", "Motivation": "Motivation"}}

    Here is the list of tags:
    {tags}
    '''

    pt = PromptTemplate.from_template(template)
    chain = pt | llm
    response = chain.invoke(input={"tags": unique_tags_list})

    try:
        json_parser = JsonOutputParser()
        res = json_parser.parse(response.content)
    except OutputParserException:
        raise OutputParserException("Context too big. Unable to parse the tags.")
    return res


if __name__ == "__main__":
    persona = input("Enter the persona name: ")
    process_posts_for_persona(persona, f"data/processed_{persona.lower()}_posts.json")
    # process_posts("data/raw_posts.json", "data/processed_posts.json")