import hashlib
import json
import pickle
import re
from typing import Optional

import numpy as np
from tqdm import tqdm

from llmgaurdrails.llms.openai_client import invoke_api


class LLMBasedQAGenerator:
    """Builds a QA training dataset from text chunks via an LLM.

    For each chunk it generates grounded question/answer pairs (label 1)
    and, for each question, a deliberately ungrounded answer (label 0),
    producing standardized entries suitable for guardrail training.
    """

    def _create_entry(self, context: str, question: str, answer: str, label: int) -> Optional[dict]:
        """Create a standardized training entry with validation checks.

        Returns None when the question or answer is empty (or the answer was
        normalized to the "[INVALID]" sentinel by _clean_answer), so callers
        must filter None results.
        """
        context = self._clean_text(context)
        question = self._clean_text(question)
        answer = self._clean_answer(answer)

        # Validate BEFORE appending "?": the original appended first, turning
        # an empty question into the truthy string "?" that slipped past the
        # check. Also reject the "[INVALID]" sentinel instead of training on it.
        if not question or not answer or answer == "[INVALID]":
            return None
        question = question.rstrip("?") + "?"

        return {
            "context": context,
            "question": question,
            "answer": answer,
            "label": int(bool(label)),  # Force 0/1 encoding
            "meta": {
                # Deterministic across processes, unlike builtin hash(),
                # which is randomized by PYTHONHASHSEED.
                "context_hash": int(hashlib.sha256(context.encode("utf-8")).hexdigest()[:16], 16),
                "answer_type": self._classify_answer_type(answer),
                "question_type": self._classify_question(question),
            },
        }

    def _clean_text(self, text: str) -> str:
        """Basic text normalization: collapse whitespace runs and trim."""
        return re.sub(r'\s+', ' ', text).strip()

    def _clean_answer(self, answer: str) -> str:
        """Answer-specific cleaning; placeholder answers become "[INVALID]"."""
        answer = self._clean_text(answer)
        if answer.lower() in ("", "n/a", "unknown"):
            return "[INVALID]"
        return answer

    def _classify_answer_type(self, answer: str) -> str:
        """Categorize answers for analysis (monetary/percentage/numeric/textual)."""
        if "$" in answer:
            return "monetary"
        if "%" in answer:
            return "percentage"
        if any(c.isdigit() for c in answer):
            return "numeric"
        return "textual"

    def _classify_question(self, question: str) -> str:
        """Identify question types (quantity/temporal/reason/factual)."""
        q = question.lower()
        if "how much" in q:
            return "quantity"
        if "when" in q:
            return "temporal"
        if "why" in q:
            return "reason"
        return "factual"

    @staticmethod
    def _parse_llm_json(raw: str) -> dict:
        """Parse a JSON object from an LLM response, tolerating ```json fences.

        The original used raw.strip("```json"), which strips any of those
        *characters* from both ends rather than removing the fence as a
        prefix/suffix; strip the fences explicitly instead.
        """
        text = raw.strip()
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
        return json.loads(text)

    def _generate_questions_and_grounded_answers(self, chunk, num_questions=3):
        """Generate up to num_questions (question, grounded answer) pairs for a chunk.

        Failed generations yield empty strings at that position so the two
        returned lists always have length num_questions.
        """
        questions = []
        answers = []
        for _ in range(num_questions):
            try:
                grounded_system_prompt = """You are a helpful assistant that generates questions and answers based on the given context. The question and answer should not exceed 15 words each. 
The response should ne a json with 'question' and 'answer as the key'"""

                grounded_message = f"Context: {chunk}\n\nGenerate a question and a grounded answer based on this context."

                grounded_qa_response = invoke_api(grounded_system_prompt, grounded_message, 0.7, max_tokens=100)
                grounded_qa = self._parse_llm_json(grounded_qa_response.choices[0].message.content)
                questions.append(grounded_qa['question'])
                answers.append(grounded_qa['answer'])
            except Exception:
                # Best-effort: keep list lengths aligned; entry is filtered later.
                questions.append('')
                answers.append('')
        return questions, answers

    def _generate_ungrounded_answer(self, chunk, question, grounded_answer):
        """Generate a plausible but factually incorrect answer for a question.

        grounded_answer is accepted for interface compatibility but unused:
        the prompt asks the model to perturb the answer from the context.
        Returns '' on any failure.
        """
        try:
            ungrounded_system_prompt = """You are a helpful assistant that generates questions and ungrounded answers that are based on the given context. But factually or logically incorrect. The 'answer' part of the response should not exceed 15 words each. The response should ne a json with just one key 'answer'"""

            ungrounded_message = (
                f"Question: {question}\n\nGenerate an ungrounded answer based on the original context {chunk}. "
                f"\nMake subtle changes to the actual answer to make it look plausible"
            )

            ungrounded_answer_response = invoke_api(ungrounded_system_prompt, ungrounded_message, 0.7, max_tokens=30)
            answer_json = self._parse_llm_json(ungrounded_answer_response.choices[0].message.content)
            return answer_json['answer']
        except Exception:
            print("errored in answer")
            return ''

    def generate_dataset(self,
                         chunks: list,
                         persist_dataset: bool = False,
                         presisted_file_path: str = "training_data") -> list:
        """Build the labeled dataset from a list of chunk dicts.

        Each element of chunks must carry the chunk text under the 'text' key.
        Invalid entries (empty question/answer) are skipped rather than
        appended as None, which the original did.
        """
        dataset = []

        for chunk_dict in tqdm(chunks, desc="Generating QA pairs"):
            chunk = chunk_dict['text']
            if not chunk.strip():
                continue

            questions, grounded_answers = self._generate_questions_and_grounded_answers(chunk)

            for question, grounded_answer in zip(questions, grounded_answers):
                if not question.strip():
                    continue
                ungrounded = self._generate_ungrounded_answer(chunk, question, grounded_answer)

                for entry in (self._create_entry(chunk, question, grounded_answer, 1),
                              self._create_entry(chunk, question, ungrounded, 0)):
                    if entry is not None:
                        dataset.append(entry)

        if persist_dataset:
            # 'ab' preserves the original append-style persistence: each call
            # appends one pickle record; readers must pickle.load repeatedly.
            with open(presisted_file_path, 'ab') as fh:
                pickle.dump(dataset, fh)

        return dataset