diff --git "a/model.py" "b/model.py"
--- "a/model.py"
+++ "b/model.py"
@@ -1,1420 +1,1519 @@
-import re
-import pycountry
-from docx import Document
-import json
-import os
-import numpy as np
-import faiss
-from collections import defaultdict
-import ast # For literal_eval
-import math # For ceiling function
-import data_preprocess
-import mtdna_classifier
-# --- IMPORTANT: UNCOMMENT AND CONFIGURE YOUR REAL API KEY ---
-import google.generativeai as genai
-
-#genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
-genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
-
-import nltk
-from nltk.corpus import stopwords
-try:
-    nltk.data.find('corpora/stopwords')
-except LookupError:
-    nltk.download('stopwords')
-nltk.download('punkt_tab')    
-# # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
-# # Prices are per 1,000 tokens
-# PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
-# PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
-# PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
-
-# Gemini 2.5 Flash-Lite pricing per 1,000 tokens
-PRICE_PER_1K_INPUT_LLM = 0.00010      # $0.10 per 1M input tokens
-PRICE_PER_1K_OUTPUT_LLM = 0.00040     # $0.40 per 1M output tokens
-
-# Embedding-001 pricing per 1,000 input tokens
-PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens
-# --- API Functions (REAL API FUNCTIONS) ---
-
-# def get_embedding(text, task_type="RETRIEVAL_DOCUMENT"):
-#     """Generates an embedding for the given text using a Google embedding model."""
-#     try:
-#         result = genai.embed_content(
-#             model="models/text-embedding-004", # Specify the embedding model
-#             content=text,
-#             task_type=task_type
-#         )
-#         return np.array(result['embedding']).astype('float32')
-#     except Exception as e:
-#         print(f"Error getting embedding: {e}")
-#         return np.zeros(768, dtype='float32')
-def get_embedding(text, task_type="RETRIEVAL_DOCUMENT"):
-    """Safe Gemini 1.5 embedding call with fallback."""
-    import numpy as np
-    try:
-        if not text or len(text.strip()) == 0:
-            raise ValueError("Empty text cannot be embedded.")
-        result = genai.embed_content(
-            model="models/text-embedding-004",
-            content=text,
-            task_type=task_type
-        )
-        return np.array(result['embedding'], dtype='float32')
-    except Exception as e:
-        print(f"❌ Embedding error: {e}")
-        return np.zeros(768, dtype='float32')
-
-
-def call_llm_api(prompt, model_name="gemini-2.5-flash-lite"):#'gemini-1.5-flash-latest'):
-    """Calls a Google Gemini LLM with the given prompt."""
-    try:
-        model = genai.GenerativeModel(model_name)
-        response = model.generate_content(prompt)
-        return response.text, model # Return model instance for token counting
-    except Exception as e:
-        print(f"Error calling LLM: {e}")
-        return "Error: Could not get response from LLM API.", None
-
-
-# --- Core Document Processing Functions (All previously provided and fixed) ---
-
-def read_docx_text(path):
-    """
-    Reads text and extracts potential table-like strings from a .docx document.
-    Separates plain text from structured [ [ ] ] list-like tables.
-    Also attempts to extract a document title.
-    """
-    doc = Document(path)
-    plain_text_paragraphs = []
-    table_strings = []
-    document_title = "Unknown Document Title" # Default
-
-    # Attempt to extract the document title from the first few paragraphs
-    title_paragraphs = [p.text.strip() for p in doc.paragraphs[:5] if p.text.strip()]
-    if title_paragraphs:
-        # A heuristic to find a title: often the first or second non-empty paragraph
-        # or a very long first paragraph if it's the title
-        if len(title_paragraphs[0]) > 50 and "Human Genetics" not in title_paragraphs[0]:
-            document_title = title_paragraphs[0]
-        elif len(title_paragraphs) > 1 and len(title_paragraphs[1]) > 50 and "Human Genetics" not in title_paragraphs[1]:
-            document_title = title_paragraphs[1]
-        elif any("Complete mitochondrial genomes" in p for p in title_paragraphs):
-            # Fallback to a known title phrase if present
-            document_title = "Complete mitochondrial genomes of Thai and Lao populations indicate an ancient origin of Austroasiatic groups and demic diffusion in the spread of Tai–Kadai languages"
-
-    current_table_lines = []
-    in_table_parsing_mode = False
-
-    for p in doc.paragraphs:
-        text = p.text.strip()
-        if not text:
-            continue
-
-        # Condition to start or continue table parsing
-        if text.startswith("## Table "): # Start of a new table section
-            if in_table_parsing_mode and current_table_lines:
-                table_strings.append("\n".join(current_table_lines))
-            current_table_lines = [text] # Include the "## Table X" line
-            in_table_parsing_mode = True
-        elif in_table_parsing_mode and (text.startswith("[") or text.startswith('"')):
-            # Continue collecting lines if we're in table mode and it looks like table data
-            # Table data often starts with '[' for lists, or '"' for quoted strings within lists.
-            current_table_lines.append(text)
-        else:
-            # If not in table mode, or if a line doesn't look like table data,
-            # then close the current table (if any) and add the line to plain text.
-            if in_table_parsing_mode and current_table_lines:
-                table_strings.append("\n".join(current_table_lines))
-                current_table_lines = []
-            in_table_parsing_mode = False
-            plain_text_paragraphs.append(text)
-
-    # After the loop, add any remaining table lines
-    if current_table_lines:
-        table_strings.append("\n".join(current_table_lines))
-
-    return "\n".join(plain_text_paragraphs), table_strings, document_title
-
-# --- Structured Data Extraction and RAG Functions ---
-
-def parse_literal_python_list(table_str):
-    list_match = re.search(r'(\[\s*\[\s*(?:.|\n)*?\s*\]\s*\])', table_str)
-    #print("Debug: list_match object (before if check):", list_match)
-    if not list_match:
-        if "table" in table_str.lower(): # then the table doest have the "]]" at the end
-            table_str += "]]"
-            list_match = re.search(r'(\[\s*\[\s*(?:.|\n)*?\s*\]\s*\])', table_str)
-    if list_match:
-        try:
-            matched_string = list_match.group(1)
-            #print("Debug: Matched string for literal_eval:", matched_string)
-            return ast.literal_eval(matched_string)
-        except (ValueError, SyntaxError) as e:
-            print(f"Error evaluating literal: {e}")
-            return []
-    return []
-
-
-_individual_code_parser = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
-def _parse_individual_code_parts(code_str):
-    match = _individual_code_parser.search(code_str)
-    if match:
-        return match.group(1), match.group(2)
-    return None, None
-
-
-def parse_sample_id_to_population_code(plain_text_content):
-    sample_id_map = {}
-    contiguous_ranges_data = defaultdict(list)
-
-    #section_start_marker = "The sample identification of each population is as follows:"
-    section_start_marker = ["The sample identification of each population is as follows:","## table"]
-    
-    for s in section_start_marker:
-      relevant_text_search = re.search(
-          re.escape(s.lower()) + r"\s*(.*?)(?=\n##|\Z)",
-          plain_text_content.lower(),
-          re.DOTALL
-      )
-      if relevant_text_search: 
-        break
-      
-    if not relevant_text_search:
-        print("Warning: 'Sample ID Population Code' section start marker not found or block empty.")
-        return sample_id_map, contiguous_ranges_data
-
-    relevant_text_block = relevant_text_search.group(1).strip()
-
-    # print(f"\nDEBUG_PARSING: --- Start of relevant_text_block (first 500 chars) ---")
-    # print(relevant_text_block[:500])
-    # print(f"DEBUG_PARSING: --- End of relevant_text_block (last 500 chars) ---")
-    # print(relevant_text_block[-500:])
-    # print(f"DEBUG_PARSING: Relevant text block length: {len(relevant_text_block)}")
-
-    mapping_pattern = re.compile(
-    r'\b([A-Z0-9]+\d+)(?:-([A-Z0-9]+\d+))?\s+([A-Z0-9]+)\b', # Changed the last group
-    re.IGNORECASE)
-
-    range_expansion_count = 0
-    direct_id_count = 0
-    total_matches_found = 0
-    for match in mapping_pattern.finditer(relevant_text_block):
-        total_matches_found += 1
-        id1_full_str, id2_full_str_opt, pop_code = match.groups()
-
-        #print(f"  DEBUG_PARSING: Matched: '{match.group(0)}'")
-
-        pop_code_upper = pop_code.upper()
-
-        id1_prefix, id1_num_str = _parse_individual_code_parts(id1_full_str)
-        if id1_prefix is None:
-            #print(f"    DEBUG_PARSING: Failed to parse ID1: {id1_full_str}. Skipping this mapping.")
-            continue
-
-        if id2_full_str_opt:
-            id2_prefix_opt, id2_num_str_opt = _parse_individual_code_parts(id2_full_str_opt)
-            if id2_prefix_opt is None:
-                #print(f"    DEBUG_PARSING: Failed to parse ID2: {id2_full_str_opt}. Treating {id1_full_str} as single ID1.")
-                sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
-                direct_id_count += 1
-                continue
-
-            #print(f"    DEBUG_PARSING: Comparing prefixes: '{id1_prefix.lower()}' vs '{id2_prefix_opt.lower()}'")
-            if id1_prefix.lower() == id2_prefix_opt.lower():
-                #print(f"    DEBUG_PARSING: ---> Prefixes MATCH for range expansion! Range: {id1_prefix}{id1_num_str}-{id2_prefix_opt}{id2_num_str_opt}")
-                try:
-                    start_num = int(id1_num_str)
-                    end_num = int(id2_num_str_opt)
-                    for num in range(start_num, end_num + 1):
-                        sample_id = f"{id1_prefix.upper()}{num}"
-                        sample_id_map[sample_id] = pop_code_upper
-                        range_expansion_count += 1
-                    contiguous_ranges_data[id1_prefix.upper()].append(
-                        (start_num, end_num, pop_code_upper)
-                    )
-                except ValueError:
-                    print(f"        DEBUG_PARSING: ValueError in range conversion for {id1_num_str}-{id2_num_str_opt}. Adding endpoints only.")
-                    sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
-                    sample_id_map[f"{id2_prefix_opt.upper()}{id2_num_str_opt}"] = pop_code_upper
-                    direct_id_count += 2
-            else:
-                #print(f"    DEBUG_PARSING: Prefixes MISMATCH for range: '{id1_prefix}' vs '{id2_prefix_opt}'. Adding endpoints only.")
-                sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
-                sample_id_map[f"{id2_prefix_opt.upper()}{id2_num_str_opt}"] = pop_code_upper
-                direct_id_count += 2
-        else:
-            sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
-            direct_id_count += 1
-
-    # print(f"DEBUG_PARSING: Total matches found by regex: {total_matches_found}.")
-    # print(f"DEBUG_PARSING: Parsed sample IDs: {len(sample_id_map)} total entries.")
-    # print(f"DEBUG_PARSING:   (including {range_expansion_count} from range expansion and {direct_id_count} direct ID/endpoint entries).")
-    return sample_id_map, contiguous_ranges_data
-
-country_keywords_regional_overrides = {
-    "north thailand": "Thailand", "central thailand": "Thailand",
-    "northeast thailand": "Thailand", "east myanmar": "Myanmar", "west thailand": "Thailand",
-    "central india": "India", "east india": "India", "northeast india": "India",
-    "south sibera": "Russia", "siberia": "Russia", "yunnan": "China", #"tibet": "China",
-    "sumatra": "Indonesia", "borneo": "Indonesia",
-    "northern mindanao": "Philippines", "west malaysia": "Malaysia",
-    "mongolia": "China",
-    "beijing": "China",
-    "north laos": "Laos", "central laos": "Laos",
-    "east myanmar": "Myanmar", "west myanmar": "Myanmar"}
-
-# Updated get_country_from_text function
-def get_country_from_text(text):
-    text_lower = text.lower()
-
-    # 1. Use pycountry for official country names and common aliases
-    for country in pycountry.countries:
-        # Check full name match first
-        if text_lower == country.name.lower():
-            return country.name
-        
-        # Safely check for common_name
-        if hasattr(country, 'common_name') and text_lower == country.common_name.lower():
-            return country.common_name
-            
-        # Safely check for official_name
-        if hasattr(country, 'official_name') and text_lower == country.official_name.lower():
-            return country.official_name
-
-        # Check if country name is part of the text (e.g., 'Thailand' in 'Thailand border')
-        if country.name.lower() in text_lower:
-            return country.name
-            
-        # Safely check if common_name is part of the text
-        if hasattr(country, 'common_name') and country.common_name.lower() in text_lower:
-            return country.common_name
-    # 2. Prioritize specific regional overrides
-    for keyword, country in country_keywords_regional_overrides.items():
-        if keyword in text_lower:
-            return country
-    # 3. Check for broader regions that you want to map to "unknown" or a specific country
-    if "north asia" in text_lower or "southeast asia" in text_lower or "east asia" in text_lower:
-        return "unknown"
-
-    return "unknown"
-
-# Get the list of English stop words from NLTK
-non_meaningful_pop_names = set(stopwords.words('english'))
-
-def parse_population_code_to_country(plain_text_content, table_strings):
-    pop_code_country_map = {}
-    pop_code_ethnicity_map = {} # NEW: To store ethnicity for structured lookup
-    pop_code_specific_loc_map = {} # NEW: To store specific location for structured lookup
-
-    # Regex for parsing population info in structured lists and general text
-    # This pattern captures: (Pop Name/Ethnicity) (Pop Code) (Region/Specific Location) (Country) (Linguistic Family)
-    # The 'Pop Name/Ethnicity' (Group 1) is often the ethnicity
-    pop_info_pattern = re.compile(
-          r'([A-Za-z\s]+?)\s+([A-Z]+\d*)\s+'      # Pop Name (Group 1), Pop Code (Group 2) - Changed \d+ to \d* for codes like 'SH'
-          r'([A-Za-z\s\(\)\-,\/]+?)\s+'          # Region/Specific Location (Group 3)
-          r'(North+|South+|West+|East+|Thailand|Laos|Cambodia|Myanmar|Philippines|Indonesia|Malaysia|China|India|Taiwan|Vietnam|Russia|Nepal|Japan|South Korea)\b' # Country (Group 4)
-          r'(?:.*?([A-Za-z\s\-]+))?\s*'          # Optional Linguistic Family (Group 5), made optional with ?, followed by optional space
-          r'(\d+(?:\s+\d+\.?\d*)*)?', # Match all the numbers (Group 6) - made optional
-          re.IGNORECASE
-      )
-    for table_str in table_strings:
-        table_data = parse_literal_python_list(table_str)
-        if table_data:
-            is_list_of_lists = bool(table_data) and isinstance(table_data[0], list)
-            if is_list_of_lists:
-                for row_idx, row in enumerate(table_data):
-                    row_text = " ".join(map(str, row))
-                    match = pop_info_pattern.search(row_text)
-                    if match:
-                        pop_name = match.group(1).strip()
-                        pop_code = match.group(2).upper()
-                        specific_loc_text = match.group(3).strip()
-                        country_text = match.group(4).strip()
-                        linguistic_family = match.group(5).strip() if match.group(5) else 'unknown'
-
-                        final_country = get_country_from_text(country_text)
-                        if final_country == 'unknown': # Try specific loc text for country if direct country is not found
-                            final_country = get_country_from_text(specific_loc_text)
-
-                        if pop_code:
-                            pop_code_country_map[pop_code] = final_country
-
-                            # Populate ethnicity map (often Pop Name is ethnicity)
-                            pop_code_ethnicity_map[pop_code] = pop_name
-
-                            # Populate specific location map
-                            pop_code_specific_loc_map[pop_code] = specific_loc_text # Store as is from text
-            else:
-                row_text = " ".join(map(str, table_data))   
-                match = pop_info_pattern.search(row_text)
-                if match:
-                    pop_name = match.group(1).strip()
-                    pop_code = match.group(2).upper()
-                    specific_loc_text = match.group(3).strip()
-                    country_text = match.group(4).strip()
-                    linguistic_family = match.group(5).strip() if match.group(5) else 'unknown'
-
-                    final_country = get_country_from_text(country_text)
-                    if final_country == 'unknown': # Try specific loc text for country if direct country is not found
-                        final_country = get_country_from_text(specific_loc_text)
-
-                    if pop_code:
-                        pop_code_country_map[pop_code] = final_country
-
-                        # Populate ethnicity map (often Pop Name is ethnicity)
-                        pop_code_ethnicity_map[pop_code] = pop_name
-
-                        # Populate specific location map
-                        pop_code_specific_loc_map[pop_code] = specific_loc_text # Store as is from text
-
-                        # # Special case refinements for ethnicity/location if more specific rules are known from document:
-                        # if pop_name.lower() == "khon mueang": # and specific conditions if needed
-                        #     pop_code_ethnicity_map[pop_code] = "Khon Mueang"
-                        #     # If Khon Mueang has a specific city/district, add here
-                        #     # e.g., if 'Chiang Mai' is directly linked to KM1 in a specific table
-                        #     # pop_code_specific_loc_map[pop_code] = "Chiang Mai"
-                        # elif pop_name.lower() == "lawa":
-                        #      pop_code_ethnicity_map[pop_code] = "Lawa"
-                        # # Add similar specific rules for other populations (e.g., Mon for MO1, MO2, MO3)
-                        # elif pop_name.lower() == "mon":
-                        #     pop_code_ethnicity_map[pop_code] = "Mon"
-                        #     # For MO2: "West Thailand (Thailand Myanmar border)" -> no city
-                        #     # For MO3: "East Myanmar (Thailand Myanmar border)" -> no city
-                        #     # If the doc gives "Bangkok" for MO4, add it here for MO4's actual specific_location.
-                        # # etc.
-
-    # Fallback to parsing general plain text content (sentences)
-    sentences = data_preprocess.extract_sentences(plain_text_content)
-    for s in sentences: # Still focusing on just this one sentence
-      # Use re.finditer to get all matches
-      matches = pop_info_pattern.finditer(s)
-      pop_name, pop_code, specific_loc_text, country_text = "unknown", "unknown", "unknown", "unknown"
-      for match in matches:
-          if match.group(1):
-            pop_name = match.group(1).strip()
-          if match.group(2):  
-            pop_code = match.group(2).upper()
-          if match.group(3):  
-            specific_loc_text = match.group(3).strip()
-          if match.group(4):  
-            country_text = match.group(4).strip()
-          # linguistic_family = match.group(5).strip() if match.group(5) else 'unknown' # Already captured by pop_info_pattern
-
-          final_country = get_country_from_text(country_text)
-          if final_country == 'unknown':
-              final_country = get_country_from_text(specific_loc_text)
-
-          if pop_code.lower() not in non_meaningful_pop_names:
-            if final_country.lower() not in non_meaningful_pop_names:
-              pop_code_country_map[pop_code] = final_country
-            if pop_name.lower() not in non_meaningful_pop_names:  
-              pop_code_ethnicity_map[pop_code] = pop_name # Default ethnicity from Pop Name
-            if specific_loc_text.lower() not in non_meaningful_pop_names:  
-              pop_code_specific_loc_map[pop_code] = specific_loc_text
-
-              # Specific rules for ethnicity/location in plain text:
-              if pop_name.lower() == "khon mueang":
-                  pop_code_ethnicity_map[pop_code] = "Khon Mueang"
-              elif pop_name.lower() == "lawa":
-                  pop_code_ethnicity_map[pop_code] = "Lawa"
-              elif pop_name.lower() == "mon":
-                  pop_code_ethnicity_map[pop_code] = "Mon"
-              elif pop_name.lower() == "seak": # Added specific rule for Seak
-                  pop_code_ethnicity_map[pop_code] = "Seak"
-              elif pop_name.lower() == "nyaw": # Added specific rule for Nyaw
-                  pop_code_ethnicity_map[pop_code] = "Nyaw"
-              elif pop_name.lower() == "nyahkur": # Added specific rule for Nyahkur
-                  pop_code_ethnicity_map[pop_code] = "Nyahkur"
-              elif pop_name.lower() == "suay": # Added specific rule for Suay
-                  pop_code_ethnicity_map[pop_code] = "Suay"
-              elif pop_name.lower() == "soa": # Added specific rule for Soa
-                  pop_code_ethnicity_map[pop_code] = "Soa"
-              elif pop_name.lower() == "bru": # Added specific rule for Bru
-                  pop_code_ethnicity_map[pop_code] = "Bru"
-              elif pop_name.lower() == "khamu": # Added specific rule for Khamu
-                  pop_code_ethnicity_map[pop_code] = "Khamu"
-
-    return pop_code_country_map, pop_code_ethnicity_map, pop_code_specific_loc_map
-
-def general_parse_population_code_to_country(plain_text_content, table_strings):
-    pop_code_country_map = {}
-    pop_code_ethnicity_map = {}
-    pop_code_specific_loc_map = {}
-    sample_id_to_pop_code = {}
-
-    for table_str in table_strings:
-        table_data = parse_literal_python_list(table_str)
-        if not table_data or not isinstance(table_data[0], list):
-            continue
-
-        header_row = [col.lower() for col in table_data[0]]
-        header_map = {col: idx for idx, col in enumerate(header_row)}
-
-        # MJ17: Direct PopCode → Country
-        if 'id' in header_map and 'country' in header_map:
-            for row in table_strings[1:]:
-                row = parse_literal_python_list(row)[0]
-                if len(row) < len(header_row):
-                    continue
-                pop_code = str(row[header_map['id']]).strip()
-                country = str(row[header_map['country']]).strip()
-                province = row[header_map['province']].strip() if 'province' in header_map else 'unknown'
-                pop_group = row[header_map['population group / region']].strip() if 'population group / region' in header_map else 'unknown'
-                pop_code_country_map[pop_code] = country
-                pop_code_specific_loc_map[pop_code] = province
-                pop_code_ethnicity_map[pop_code] = pop_group
-
-        # A1YU101 or EBK/KSK: SampleID → PopCode
-        elif 'sample id' in header_map and 'population code' in header_map:
-            for row in table_strings[1:]:
-                row = parse_literal_python_list(row)[0]
-                if len(row) < 2:
-                    continue
-                sample_id = row[header_map['sample id']].strip().upper()
-                pop_code = row[header_map['population code']].strip().upper()
-                sample_id_to_pop_code[sample_id] = pop_code
-
-        # PopCode → Country (A1YU101/EBK mapping)
-        elif 'population code' in header_map and 'country' in header_map:
-            for row in table_strings[1:]:
-                row = parse_literal_python_list(row)[0]
-                if len(row) < 2:
-                    continue
-                pop_code = row[header_map['population code']].strip().upper()
-                country = row[header_map['country']].strip()
-                pop_code_country_map[pop_code] = country
-
-    return pop_code_country_map, pop_code_ethnicity_map, pop_code_specific_loc_map, sample_id_to_pop_code
-
-def chunk_text(text, chunk_size=500, overlap=50):
-    """Splits text into chunks (by words) with overlap."""
-    chunks = []
-    words = text.split()
-    num_words = len(words)
-
-    start = 0
-    while start < num_words:
-        end = min(start + chunk_size, num_words)
-        chunk = " ".join(words[start:end])
-        chunks.append(chunk)
-
-        if end == num_words:
-            break
-        start += chunk_size - overlap # Move start by (chunk_size - overlap)
-    return chunks
-
-def build_vector_index_and_data(doc_path, index_path="faiss_index.bin", chunks_path="document_chunks.json", structured_path="structured_lookup.json"):
-    """
-    Reads document, builds structured lookup, chunks remaining text, embeds chunks,
-    and builds/saves a FAISS index.
-    """
-    print("Step 1: Reading document and extracting structured data...")
-    # plain_text_content, table_strings, document_title = read_docx_text(doc_path) # Get document_title here
-
-    # sample_id_map, contiguous_ranges_data = parse_sample_id_to_population_code(plain_text_content)
-    # pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc = parse_population_code_to_country(plain_text_content, table_strings)
-
-    # master_structured_lookup = {}
-    # master_structured_lookup['document_title'] = document_title # Store document title
-    # master_structured_lookup['sample_id_map'] = sample_id_map
-    # master_structured_lookup['contiguous_ranges'] = dict(contiguous_ranges_data)
-    # master_structured_lookup['pop_code_to_country'] = pop_code_to_country
-    # master_structured_lookup['pop_code_to_ethnicity'] = pop_code_to_ethnicity # NEW: Store pop_code to ethnicity map
-    # master_structured_lookup['pop_code_to_specific_loc'] = pop_code_to_specific_loc # NEW: Store pop_code to specific_loc map
-
-
-    # # Final consolidation: Use sample_id_map to derive full info for queries
-    # final_structured_entries = {}
-    # for sample_id, pop_code in master_structured_lookup['sample_id_map'].items():
-    #     country = master_structured_lookup['pop_code_to_country'].get(pop_code, 'unknown')
-    #     ethnicity = master_structured_lookup['pop_code_to_ethnicity'].get(pop_code, 'unknown') # Retrieve ethnicity
-    #     specific_location = master_structured_lookup['pop_code_to_specific_loc'].get(pop_code, 'unknown') # Retrieve specific location
-
-    #     final_structured_entries[sample_id] = {
-    #         'population_code': pop_code,
-    #         'country': country,
-    #         'type': 'modern',
-    #         'ethnicity': ethnicity, # Store ethnicity
-    #         'specific_location': specific_location # Store specific location
-    #     }
-    # master_structured_lookup['final_structured_entries'] = final_structured_entries
-    plain_text_content, table_strings, document_title = read_docx_text(doc_path)
-    pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc, sample_id_map = general_parse_population_code_to_country(plain_text_content, table_strings)
-
-    final_structured_entries = {}
-    if sample_id_map:
-        for sample_id, pop_code in sample_id_map.items():
-            country = pop_code_to_country.get(pop_code, 'unknown')
-            ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
-            specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
-            final_structured_entries[sample_id] = {
-                'population_code': pop_code,
-                'country': country,
-                'type': 'modern',
-                'ethnicity': ethnicity,
-                'specific_location': specific_loc
-            }
-    else:
-        for pop_code in pop_code_to_country.keys():
-            country = pop_code_to_country.get(pop_code, 'unknown')
-            ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
-            specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
-            final_structured_entries[pop_code] = {
-                'population_code': pop_code,
-                'country': country,
-                'type': 'modern',
-                'ethnicity': ethnicity,
-                'specific_location': specific_loc
-            }
-    if not final_structured_entries:
-      # traditional way of A1YU101
-      sample_id_map, contiguous_ranges_data = parse_sample_id_to_population_code(plain_text_content)
-      pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc = parse_population_code_to_country(plain_text_content, table_strings)
-      if sample_id_map:
-        for sample_id, pop_code in sample_id_map.items():
-            country = pop_code_to_country.get(pop_code, 'unknown')
-            ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
-            specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
-            final_structured_entries[sample_id] = {
-                'population_code': pop_code,
-                'country': country,
-                'type': 'modern',
-                'ethnicity': ethnicity,
-                'specific_location': specific_loc
-            }
-      else:
-          for pop_code in pop_code_to_country.keys():
-              country = pop_code_to_country.get(pop_code, 'unknown')
-              ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
-              specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
-              final_structured_entries[pop_code] = {
-                  'population_code': pop_code,
-                  'country': country,
-                  'type': 'modern',
-                  'ethnicity': ethnicity,
-                  'specific_location': specific_loc
-              }
-    
-    master_lookup = {
-        'document_title': document_title,
-        'pop_code_to_country': pop_code_to_country,
-        'pop_code_to_ethnicity': pop_code_to_ethnicity,
-        'pop_code_to_specific_loc': pop_code_to_specific_loc,
-        'sample_id_map': sample_id_map,
-        'final_structured_entries': final_structured_entries
-    }
-    print(f"Structured lookup built with {len(final_structured_entries)} entries in 'final_structured_entries'.")
-
-    with open(structured_path, 'w') as f:
-        json.dump(master_lookup, f, indent=4)
-    print(f"Structured lookup saved to {structured_path}.")
-
-    print("Step 2: Chunking document for RAG vector index...")
-    # replace the chunk here with the all_output from process_inputToken and fallback to this traditional chunk
-    clean_text, clean_table = "", ""
-    if plain_text_content:
-      clean_text = data_preprocess.normalize_for_overlap(plain_text_content)
-    if table_strings:
-      clean_table = data_preprocess.normalize_for_overlap(". ".join(table_strings))
-    all_clean_chunk = clean_text + clean_table
-    document_chunks = chunk_text(all_clean_chunk)
-    print(f"Document chunked into {len(document_chunks)} chunks.")
-    
-    print("Step 3: Generating embeddings for chunks (this might take time and cost API calls)...")
-
-    embedding_model_for_chunks = genai.GenerativeModel('models/text-embedding-004')
-
-    chunk_embeddings = []
-    for i, chunk in enumerate(document_chunks):
-        embedding = get_embedding(chunk, task_type="RETRIEVAL_DOCUMENT")
-        if embedding is not None and embedding.shape[0] > 0:
-            chunk_embeddings.append(embedding)
-        else:
-            print(f"Warning: Failed to get valid embedding for chunk {i}. Skipping.")
-            chunk_embeddings.append(np.zeros(768, dtype='float32'))
-
-    if not chunk_embeddings:
-        raise ValueError("No valid embeddings generated. Check get_embedding function and API.")
-
-    embedding_dimension = chunk_embeddings[0].shape[0]
-    index = faiss.IndexFlatL2(embedding_dimension)
-    index.add(np.array(chunk_embeddings))
-
-    faiss.write_index(index, index_path)
-    with open(chunks_path, "w") as f:
-        json.dump(document_chunks, f)
-
-    print(f"FAISS index built and saved to {index_path}.")
-    print(f"Document chunks saved to {chunks_path}.")
-    return master_lookup, index, document_chunks, all_clean_chunk
-
-
-def load_rag_assets(index_path="faiss_index.bin", chunks_path="document_chunks.json", structured_path="structured_lookup.json"):
-    """Loads pre-built RAG assets (FAISS index, chunks, structured lookup)."""
-    print("Loading RAG assets...")
-    master_structured_lookup = {}
-    if os.path.exists(structured_path):
-        with open(structured_path, 'r') as f:
-            master_structured_lookup = json.load(f)
-        print("Structured lookup loaded.")
-    else:
-        print("Structured lookup file not found. Rebuilding is likely needed.")
-
-    index = None
-    chunks = []
-    if os.path.exists(index_path) and os.path.exists(chunks_path):
-        try:
-            index = faiss.read_index(index_path)
-            with open(chunks_path, "r") as f:
-                chunks = json.load(f)
-            print("FAISS index and chunks loaded.")
-        except Exception as e:
-            print(f"Error loading FAISS index or chunks: {e}. Will rebuild.")
-            index = None
-            chunks = []
-    else:
-        print("FAISS index or chunks files not found.")
-
-    return master_structured_lookup, index, chunks
-# Helper function for query_document_info
-def exactInContext(text, keyword):
-# try keyword_prfix
-  # code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
-  # # Attempt to parse the keyword into its prefix and numerical part using re.search
-  # keyword_match = code_pattern.search(keyword)
-  # keyword_prefix = None
-  # keyword_num = None
-  # if keyword_match:
-  #     keyword_prefix = keyword_match.group(1).lower()
-  #     keyword_num = int(keyword_match.group(2))
-  text = text.lower()
-  idx = text.find(keyword.lower())
-  if idx == -1:
-    # if keyword_prefix:
-    #   idx = text.find(keyword_prefix)
-    # if idx == -1:
-    #   return False
-    return False
-  return True
-def chooseContextLLM(contexts, kw):
-  # if kw in context
-  for con in contexts:
-    context = contexts[con]
-    if context:
-      if exactInContext(context, kw):
-        return con, context    
-  #if cannot find anything related to kw in context, return all output
-  if contexts["all_output"]:
-    return "all_output", contexts["all_output"]
-  else:
-    # if all_output not exist
-    # look of chunk and still not exist return document chunk
-    if contexts["chunk"]: return "chunk", contexts["chunk"]
-    elif contexts["document_chunk"]:  return "document_chunk", contexts["document_chunk"]
-    else: return None, None  
-def clean_llm_output(llm_response_text, output_format_str):
-    results = []
-    lines = llm_response_text.strip().split('\n')
-    output_country, output_type, output_ethnicity, output_specific_location = [],[],[],[]
-    for line in lines:
-        extracted_country, extracted_type, extracted_ethnicity, extracted_specific_location = "unknown", "unknown", "unknown", "unknown"
-        line = line.strip()
-        if output_format_str == "ethnicity, specific_location/unknown": # Targeted RAG output
-            parsed_output = re.search(r'^\s*([^,]+?),\s*(.+?)\s*$', llm_response_text)
-            if parsed_output:
-                extracted_ethnicity = parsed_output.group(1).strip()
-                extracted_specific_location = parsed_output.group(2).strip()
-            else:
-                print("  DEBUG: LLM did not follow expected 2-field format for targeted RAG. Defaulting to unknown for ethnicity/specific_location.")
-                extracted_ethnicity = 'unknown'
-                extracted_specific_location = 'unknown'
-        elif output_format_str == "modern/ancient/unknown, ethnicity, specific_location/unknown":
-          parsed_output = re.search(r'^\s*([^,]+?),\s*([^,]+?),\s*(.+?)\s*$', llm_response_text)
-          if parsed_output:
-              extracted_type = parsed_output.group(1).strip()
-              extracted_ethnicity = parsed_output.group(2).strip()
-              extracted_specific_location = parsed_output.group(3).strip()
-          else:
-              # Fallback: check if only 2 fields
-              parsed_output_2_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?)\s*$', llm_response_text)
-              if parsed_output_2_fields:
-                  extracted_type = parsed_output_2_fields.group(1).strip()
-                  extracted_ethnicity = parsed_output_2_fields.group(2).strip()
-                  extracted_specific_location = 'unknown'
-              else:
-                  # even simpler fallback: 1 field only
-                  parsed_output_1_field = re.search(r'^\s*([^,]+?)\s*$', llm_response_text)
-                  if parsed_output_1_field:
-                      extracted_type = parsed_output_1_field.group(1).strip()
-                      extracted_ethnicity = 'unknown'
-                      extracted_specific_location = 'unknown'
-                  else:
-                      print("  DEBUG: LLM did not follow any expected simplified format. Attempting verbose parsing fallback.")
-                      type_match_fallback = re.search(r'Type:\s*([A-Za-z\s-]+)', llm_response_text)
-                      extracted_type = type_match_fallback.group(1).strip() if type_match_fallback else 'unknown'
-                      extracted_ethnicity = 'unknown'
-                      extracted_specific_location = 'unknown'
-        else:
-          parsed_output = re.search(r'^\s*([^,]+?),\s*([^,]+?),\s*([^,]+?),\s*(.+?)\s*$', line)
-          if parsed_output:
-              extracted_country = parsed_output.group(1).strip()
-              extracted_type = parsed_output.group(2).strip()
-              extracted_ethnicity = parsed_output.group(3).strip()
-              extracted_specific_location = parsed_output.group(4).strip()
-          else:
-              print(f"  DEBUG: Line did not follow expected 4-field format: {line}")
-              parsed_output_2_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?)\s*$', line)
-              if parsed_output_2_fields:
-                  extracted_country = parsed_output_2_fields.group(1).strip()
-                  extracted_type = parsed_output_2_fields.group(2).strip()
-                  extracted_ethnicity = 'unknown'
-                  extracted_specific_location = 'unknown'
-              else:
-                  print(f"  DEBUG: Fallback to verbose-style parsing: {line}")
-                  country_match_fallback = re.search(r'Country:\s*([A-Za-z\s-]+)', line)
-                  type_match_fallback = re.search(r'Type:\s*([A-Za-z\s-]+)', line)
-                  extracted_country = country_match_fallback.group(1).strip() if country_match_fallback else 'unknown'
-                  extracted_type = type_match_fallback.group(1).strip() if type_match_fallback else 'unknown'
-                  extracted_ethnicity = 'unknown'
-                  extracted_specific_location = 'unknown'
-
-        results.append({
-            "country": extracted_country,
-            "type": extracted_type,
-            "ethnicity": extracted_ethnicity,
-            "specific_location": extracted_specific_location
-            #"country_explain":extracted_country_explain,
-            #"type_explain": extracted_type_explain
-        })
-    # if more than 2 results
-    if output_format_str == "ethnicity, specific_location/unknown":
-      for result in results:
-        if result["ethnicity"] not in output_ethnicity:
-          output_ethnicity.append(result["ethnicity"])
-        if result["specific_location"] not in output_specific_location:  
-          output_specific_location.append(result["specific_location"])
-      return " or ".join(output_ethnicity), " or ".join(output_specific_location)     
-    elif output_format_str == "modern/ancient/unknown, ethnicity, specific_location/unknown":
-      for result in results:
-        if result["type"] not in output_type:
-          output_type.append(result["type"])
-        if result["ethnicity"] not in output_ethnicity:
-          output_ethnicity.append(result["ethnicity"])
-        if result["specific_location"] not in output_specific_location:  
-          output_specific_location.append(result["specific_location"])
-
-      return " or ".join(output_type)," or ".join(output_ethnicity), " or ".join(output_specific_location)    
-    else:
-      for result in results:
-        if result["country"] not in output_country:
-          output_country.append(result["country"])
-        if result["type"] not in output_type:
-          output_type.append(result["type"])
-        if result["ethnicity"] not in output_ethnicity:
-          output_ethnicity.append(result["ethnicity"])
-        if result["specific_location"] not in output_specific_location:  
-          output_specific_location.append(result["specific_location"])
-      return " or ".join(output_country)," or ".join(output_type)," or ".join(output_ethnicity), " or ".join(output_specific_location)           
-
-# def parse_multi_sample_llm_output(raw_response: str, output_format_str):
-#     """
-#     Parse LLM output with possibly multiple metadata lines + shared explanations.
-#     """
-#     lines = [line.strip() for line in raw_response.strip().splitlines() if line.strip()]
-#     metadata_list = []
-#     explanation_lines = []
-#     if output_format_str == "country_name, modern/ancient/unknown":
-#         parts = [x.strip() for x in lines[0].split(",")]
-#         if len(parts)==2:
-#           metadata_list.append({
-#               "country": parts[0],
-#               "sample_type": parts[1]#,
-#               #"ethnicity": parts[2],
-#               #"location": parts[3]
-#           })
-#         if 1<len(lines):
-#           line = lines[1]
-#           if "\n" in line:  line = line.split("\n")
-#           if ". " in line: line = line.split(". ")
-#           if isinstance(line,str): line = [line]
-#           explanation_lines += line
-#     elif output_format_str == "modern/ancient/unknown":
-#       metadata_list.append({
-#           "country": "unknown",
-#           "sample_type": lines[0]#,
-#           #"ethnicity": parts[2],
-#           #"location": parts[3]
-#       })
-#       explanation_lines.append(lines[1])
-
-#     # Assign explanations (optional) to each sample — same explanation reused
-#     for md in metadata_list:
-#         md["country_explanation"] = None
-#         md["sample_type_explanation"] = None
-
-#         if md["country"].lower() != "unknown" and len(explanation_lines) >= 1:
-#             md["country_explanation"] = explanation_lines[0]
-
-#         if md["sample_type"].lower() != "unknown":
-#             if len(explanation_lines) >= 2:
-#                 md["sample_type_explanation"] = explanation_lines[1]
-#             elif len(explanation_lines) == 1 and md["country"].lower() == "unknown":
-#                 md["sample_type_explanation"] = explanation_lines[0]
-#             elif len(explanation_lines) == 1:
-#                 md["sample_type_explanation"] = explanation_lines[0]
-#     return metadata_list
-
-def parse_multi_sample_llm_output(raw_response: str, output_format_str):
-    """
-    Parse LLM output with possibly multiple metadata lines + shared explanations.
-    """
-    metadata_list = {}
-    explanation_lines = []
-    output_answers = raw_response.split("\n")[0].split(", ")
-    explanation_lines =  [x for x in raw_response.split("\n")[1:] if x.strip()]
-    print("raw explanation line which split by new line: ", explanation_lines)
-    if len(explanation_lines) == 1:
-        if len(explanation_lines[0].split(". ")) > len(explanation_lines):
-          explanation_lines =  [x for x in explanation_lines[0].split(". ") if x.strip()]
-          print("explain line split by dot: ", explanation_lines)  
-    output_formats = output_format_str.split(", ")
-    explain = ""
-    # assign output format to its output answer and explanation
-    if output_format_str:
-      outputs = output_format_str.split(", ")
-      for o in range(len(outputs)):
-        output = outputs[o]
-        metadata_list[output] = {"answer":"",
-                                  output+"_explanation":""}   
-        # assign output answers
-        if o < len(output_answers):
-          # check if output_format unexpectedly in the answer such as:  
-          #country_name: Europe, modern/ancient: modern
-          try: 
-            if ": " in output_answers[o]:
-              output_answers[o] = output_answers[o].split(": ")[1]
-          except:
-            pass    
-          # Europe, modern  
-          metadata_list[output]["answer"] = output_answers[o]
-          if "unknown" in metadata_list[output]["answer"].lower():
-            metadata_list[output]["answer"] = "unknown"
-        else:
-          metadata_list[output]["answer"] = "unknown"
-        # assign explanations
-        if metadata_list[output]["answer"] != "unknown":
-          if explanation_lines:
-            explain = explanation_lines.pop(0)
-          else:
-            explain = ". ".join(explanation_lines)      
-          metadata_list[output][output+"_explanation"] = explain
-        else:  
-          metadata_list[output][output+"_explanation"] = "unknown"  
-    return metadata_list
-
-def merge_metadata_outputs(metadata_list):
-    """
-    Merge a list of metadata dicts into one, combining differing values with 'or'.
-    Assumes all dicts have the same keys.
-    """
-    if not metadata_list:
-        return {}
-
-    merged = {}
-    keys = metadata_list[0].keys()
-
-    for key in keys:
-        values = [md[key] for md in metadata_list if key in md]
-        unique_values = list(dict.fromkeys(values))  # preserve order, remove dupes
-        if "unknown" in unique_values:
-          unique_values.pop(unique_values.index("unknown"))
-        if len(unique_values) == 1:
-            merged[key] = unique_values[0]
-        else:
-            merged[key] = " or ".join(unique_values)
-
-    return merged
-
-
-def query_document_info(query_word, alternative_query_word, metadata, master_structured_lookup, faiss_index, document_chunks, llm_api_function, chunk=None, all_output=None, model_ai=None):
-    """
-    Queries the document using a hybrid approach:
-    1. Local structured lookup (fast, cheap, accurate for known patterns).
-    2. RAG with semantic search and LLM (general, flexible, cost-optimized).
-    """
-    print("inside the model.query_doc_info")
-    if model_ai:
-      if model_ai == "gemini-1.5-flash-latest":
-        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
-        PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
-        PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
-        PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
-        global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-1.5-flash-latest")#('gemini-1.5-flash-latest')
-    else:
-      genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))    
-      # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
-      PRICE_PER_1K_INPUT_LLM = 0.00010      # $0.10 per 1M input tokens
-      PRICE_PER_1K_OUTPUT_LLM = 0.00040     # $0.40 per 1M output tokens
-
-      # Embedding-001 pricing per 1,000 input tokens
-      PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens
-      global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-2.5-flash-lite")#('gemini-1.5-flash-latest')
-    
-    if metadata:
-      extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = metadata["country"], metadata["specific_location"], metadata["ethnicity"], metadata["sample_type"]
-      extracted_col_date, extracted_iso, extracted_title, extracted_features = metadata["collection_date"], metadata["isolate"], metadata["title"], metadata["all_features"]
-    else:
-      extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = "unknown", "unknown", "unknown", "unknown"
-      extracted_col_date, extracted_iso, extracted_title = "unknown", "unknown", "unknown"
-    # --- NEW: Pre-process alternative_query_word to remove '.X' suffix if present ---
-    if alternative_query_word:
-        alternative_query_word_cleaned = alternative_query_word.split('.')[0]
-    else:
-        alternative_query_word_cleaned = alternative_query_word
-    country_explanation, sample_type_explanation = None, None
-
-    # Use the consolidated final_structured_entries for direct lookup
-    # final_structured_entries = master_structured_lookup.get('final_structured_entries', {})
-    # document_title = master_structured_lookup.get('document_title', 'Unknown Document Title') # Retrieve document title
-
-    # Default values for all extracted fields. These will be updated.
-    method_used = 'unknown' # Will be updated based on the method that yields a result
-    population_code_from_sl = 'unknown' # To pass to RAG prompt if available
-    total_query_cost = 0
-    # Attempt 1: Try primary query_word (e.g., isolate name) with structured lookup
-    # try: 
-    #     print("try attempt 1 in model query")
-    #     structured_info = final_structured_entries.get(query_word.upper())
-    #     if structured_info:
-    #         if extracted_country == 'unknown':
-    #           extracted_country = structured_info['country']
-    #         if extracted_type == 'unknown':
-    #           extracted_type = structured_info['type']
-              
-    #         # if extracted_ethnicity == 'unknown':
-    #         #   extracted_ethnicity = structured_info.get('ethnicity', 'unknown') # Get ethnicity from structured lookup
-    #         # if extracted_specific_location == 'unknown':
-    #         #   extracted_specific_location = structured_info.get('specific_location', 'unknown') # Get specific_location from structured lookup
-    #         population_code_from_sl = structured_info['population_code']
-    #         method_used = "structured_lookup_direct"
-    #         print(f"'{query_word}' found in structured lookup (direct match).")
-    # except:
-    #     print("pass attempt 1 in model query")
-    #     pass 
-    # # Attempt 2: Try primary query_word with heuristic range lookup if direct fails (only if not already resolved)
-    # try:
-    #     print("try attempt 2 in model query")
-    #     if method_used == 'unknown':
-    #         query_prefix, query_num_str = _parse_individual_code_parts(query_word)
-    #         if query_prefix is not None and query_num_str is not None:
-    #             try: query_num = int(query_num_str)
-    #             except ValueError: query_num = None
-    #             if query_num is not None:
-    #                 query_prefix_upper = query_prefix.upper()
-    #                 contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
-    #                 pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
-    #                 pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
-    #                 pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
-    
-    #                 if query_prefix_upper in contiguous_ranges:
-    #                     for start_num, end_num, pop_code_for_range in contiguous_ranges[query_prefix_upper]:
-    #                         if start_num <= query_num <= end_num:
-    #                             country_from_heuristic = pop_code_to_country.get(pop_code_for_range, 'unknown')
-    #                             if country_from_heuristic != 'unknown':
-    #                                 if extracted_country == 'unknown':
-    #                                   extracted_country = country_from_heuristic
-    #                                 if extracted_type == 'unknown':
-    #                                   extracted_type = 'modern'
-    #                                 # if extracted_ethnicity == 'unknown':
-    #                                 #   extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
-    #                                 # if extracted_specific_location == 'unknown':
-    #                                 #   extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
-    #                                 population_code_from_sl = pop_code_for_range
-    #                                 method_used = "structured_lookup_heuristic_range_match"
-    #                                 print(f"'{query_word}' not direct. Heuristic: Falls within range {query_prefix_upper}{start_num}-{query_prefix_upper}{end_num}.")
-    #                                 break
-    #                             else:
-    #                                 print(f"'{query_word}' heuristic match found, but country unknown. Will fall to RAG below.")
-    # except:
-    #     print("pass attempt 2 in model query")                              
-    #     pass
-    # # Attempt 3: If primary query_word failed all structured lookups, try alternative_query_word (cleaned)
-    # try:
-    #     print("try attempt 3 in model query")
-    #     if method_used == 'unknown' and alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
-    #         print(f"'{query_word}' not found in structured (or heuristic). Trying alternative '{alternative_query_word_cleaned}'.")
-    
-    #         # Try direct lookup for alternative word
-    #         structured_info_alt = final_structured_entries.get(alternative_query_word_cleaned.upper())
-    #         if structured_info_alt:
-    #           if extracted_country == 'unknown':
-    #             extracted_country = structured_info_alt['country']
-    #           if extracted_type == 'unknown':
-    #             extracted_type = structured_info_alt['type']
-    #           # if extracted_ethnicity == 'unknown':
-    #           #   extracted_ethnicity = structured_info_alt.get('ethnicity', 'unknown')
-    #           # if extracted_specific_location == 'unknown':
-    #           #   extracted_specific_location = structured_info_alt.get('specific_location', 'unknown')
-    #           population_code_from_sl = structured_info_alt['population_code']
-    #           method_used = "structured_lookup_alt_direct"
-    #           print(f"Alternative '{alternative_query_word_cleaned}' found in structured lookup (direct match).")
-    #         else:
-    #             # Try heuristic lookup for alternative word
-    #             alt_prefix, alt_num_str = _parse_individual_code_parts(alternative_query_word_cleaned)
-    #             if alt_prefix is not None and alt_num_str is not None:
-    #                 try: alt_num = int(alt_num_str)
-    #                 except ValueError: alt_num = None
-    #                 if alt_num is not None:
-    #                     alt_prefix_upper = alt_prefix.upper()
-    #                     contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
-    #                     pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
-    #                     pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
-    #                     pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
-    #                     if alt_prefix_upper in contiguous_ranges:
-    #                         for start_num, end_num, pop_code_for_range in contiguous_ranges[alt_prefix_upper]:
-    #                             if start_num <= alt_num <= end_num:
-    #                                 country_from_heuristic_alt = pop_code_to_country.get(pop_code_for_range, 'unknown')
-    #                                 if country_from_heuristic_alt != 'unknown':
-    #                                   if extracted_country == 'unknown':
-    #                                     extracted_country = country_from_heuristic_alt
-    #                                   if extracted_type == 'unknown':
-    #                                     extracted_type = 'modern'
-    #                                   # if extracted_ethnicity == 'unknown':
-    #                                   #   extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
-    #                                   # if extracted_specific_location == 'unknown':
-    #                                   #   extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
-    #                                   population_code_from_sl = pop_code_for_range
-    #                                   method_used = "structured_lookup_alt_heuristic_range_match"
-    #                                   break
-    #                                 else:
-    #                                     print(f"Alternative '{alternative_query_word_cleaned}' heuristic match found, but country unknown. Will fall to RAG below.")
-    # except:
-    #     print("pass attempt 3 in model query")
-    #     pass
-    # use the context_for_llm to detect present_ancient before using llm model
-    # retrieved_chunks_text = []
-    # if document_chunks:
-    #   for idx in range(len(document_chunks)):
-    #           retrieved_chunks_text.append(document_chunks[idx])
-    # context_for_llm = ""
-    # all_context = "\n".join(retrieved_chunks_text) # 
-    # listOfcontexts = {"chunk": chunk, 
-    #         "all_output": all_output,
-    #         "document_chunk": all_context}
-    # label, context_for_llm = chooseContextLLM(listOfcontexts, query_word) 
-    # if not context_for_llm:
-    #   label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
-    #   if not context_for_llm:
-    #     context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
-    # if context_for_llm:
-    #   extracted_type, explain = mtdna_classifier.detect_ancient_flag(context_for_llm)
-    #   extracted_type = extracted_type.lower()
-    #   sample_type_explanation = explain
-    # 5. Execute RAG if needed (either full RAG or targeted RAG for missing fields)
-
-    # Determine if a RAG call is necessary
-    # run_rag = (extracted_country == 'unknown' or extracted_type == 'unknown')# or \
-    #            #extracted_ethnicity == 'unknown' or extracted_specific_location == 'unknown')
-    run_rag = True
-    if run_rag:
-        print("try run rag")
-        # Determine the phrase for LLM query
-        rag_query_phrase = f"'{query_word}'"
-        if alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
-            rag_query_phrase += f" or its alternative word '{alternative_query_word_cleaned}'"
-
-        # Construct a more specific semantic query phrase for embedding if structured info is available
-        semantic_query_for_embedding = rag_query_phrase # Default
-        # if extracted_country != 'unknown': # If country is known from structured lookup (for targeted RAG)
-        #     if population_code_from_sl != 'unknown':
-        #         semantic_query_for_embedding = f"ethnicity and specific location for {query_word} population {population_code_from_sl} in {extracted_country}"
-        #     else: # If pop_code not found in structured, still use country hint
-        #         semantic_query_for_embedding = f"ethnicity and specific location for {query_word} in {extracted_country}"
-        # print(f"  DEBUG: Semantic query for embedding: '{semantic_query_for_embedding}'")
-
-
-        # Determine fields to ask LLM for and output format based on what's known/needed
-        prompt_instruction_prefix = ""
-        output_format_str = ""
-
-        # Determine if it's a full RAG or targeted RAG scenario based on what's already extracted
-        is_full_rag_scenario = True#(extracted_country == 'unknown')
-
-        if is_full_rag_scenario: # Full RAG scenario
-            output_format_str = "country_name, modern/ancient/unknown"#, ethnicity, specific_location/unknown"
-            method_used = "rag_llm"
-            print(f"Proceeding to FULL RAG for {rag_query_phrase}.")
-        # else: # Targeted RAG scenario (country/type already known, need ethnicity/specific_location)
-        #     if extracted_type == "unknown":
-        #         prompt_instruction_prefix = (
-        #             f"I already know the country is {extracted_country}. "
-        #             f"{f'The population code is {population_code_from_sl}. ' if population_code_from_sl != 'unknown' else ''}"
-        #         )
-        #         #output_format_str = "modern/ancient/unknown, ethnicity, specific_location/unknown"
-        #         output_format_str = "modern/ancient/unknown"
-        #     # else:
-        #     #     prompt_instruction_prefix = (
-        #     #         f"I already know the country is {extracted_country} and the sample type is {extracted_type}. "
-        #     #         f"{f'The population code is {population_code_from_sl}. ' if population_code_from_sl != 'unknown' else ''}"
-        #     #     )
-        #     #     output_format_str = "ethnicity, specific_location/unknown"
-
-        #     method_used = "hybrid_sl_rag"
-        #     print(f"Proceeding to TARGETED RAG for {rag_query_phrase}.")
-
-
-        # Calculate embedding cost for the primary query word
-        current_embedding_cost = 0
-        # try:
-        #     query_embedding_vector = get_embedding(semantic_query_for_embedding, task_type="RETRIEVAL_QUERY")
-        #     query_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(semantic_query_for_embedding).total_tokens
-        #     current_embedding_cost += (query_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
-        #     print(f"  DEBUG: Query embedding tokens (for '{semantic_query_for_embedding}'): {query_embedding_tokens}, cost: ${current_embedding_cost:.6f}")
-
-        #     if alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
-        #         alt_embedding_vector = get_embedding(alternative_query_word_cleaned, task_type="RETRIEVAL_QUERY")
-        #         alt_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(alternative_query_word_cleaned).total_tokens
-        #         current_embedding_cost += (alt_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
-        #         print(f"  DEBUG: Alternative query ('{alternative_query_word_cleaned}') embedding tokens: {alt_embedding_tokens}, cost: ${current_embedding_cost:.6f}")
-
-        # except Exception as e:
-        #     print(f"Error getting query embedding for RAG: {e}")
-        #     return extracted_country, extracted_type, "embedding_failed", extracted_ethnicity, extracted_specific_location, total_query_cost
-
-        # if query_embedding_vector is None or query_embedding_vector.shape[0] == 0:
-        #     return extracted_country, extracted_type, "embedding_failed", extracted_ethnicity, extracted_specific_location, total_query_cost
-
-        # D, I = faiss_index.search(np.array([query_embedding_vector]), 4)
-
-        # retrieved_chunks_text = []
-        # for idx in I[0]:
-        #     if 0 <= idx < len(document_chunks):
-        #         retrieved_chunks_text.append(document_chunks[idx])
-
-        # context_for_llm = ""
-        
-        # all_context = "\n".join(retrieved_chunks_text) # 
-        print("direct to llm")
-        listOfcontexts = {"chunk": chunk, 
-            "all_output": all_output,
-            "document_chunk": chunk}
-        label, context_for_llm = chooseContextLLM(listOfcontexts, query_word) 
-        if not context_for_llm:
-          label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
-          if not context_for_llm:
-            context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
-        #print("context for llm: ", label)    
-        # prompt_for_llm = (
-        #     f"{prompt_instruction_prefix}"
-        #     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in general if these specific identifiers are not explicitly found. "
-        #     f"Identify its primary associated country/geographic location. "
-        #     f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
-        #     f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
-        #     f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
-        #     f"Additionally, extract its ethnicity and a more specific location (city/district level) within the predicted country. "
-        #     f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
-        #     f"Provide only the country, sample type, ethnicity, and specific location, do not add extra explanations.\n\n"
-        #     f"Text Snippets:\n{context_for_llm}\n\n"
-        #     f"Output Format: {output_format_str}"
-        # )
-        if len(context_for_llm) > 1000*1000:
-          context_for_llm = context_for_llm[:900000]
-
-        # fix the prompt better:
-        # firstly clarify more by saying which type of organism, prioritize homo sapiens
-        features = metadata["all_features"]
-        organism = "general"
-        if features != "unknown":
-          if "organism" in features:
-            try:
-              organism = features.split("organism: ")[1].split("\n")[0]  
-            except:
-              organism = features.replace("\n","; ")  
-        explain_list = "country or sample type (modern/ancient)" #or ethnicity or specific location (province/city)"    
-        
-#         prompt_for_llm = (
-#     f"{prompt_instruction_prefix}"
-#     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in general if these specific identifiers are not explicitly found. "
-#     f"Identify its primary associated country/geographic location. "
-#     f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
-#     f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
-#     f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
-#     f"Provide only {output_format_str}. "
-#     f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
-#     f"If the country or sample type (modern/ancient) is not 'unknown', write 1 sentence after the output explaining how you inferred it from the text (one sentence for each)."
-#     f"\n\nText Snippets:\n{context_for_llm}\n\n"
-#     f"Output Format: {output_format_str}"
-# )
-        
-#         prompt_for_llm = (
-#     f"{prompt_instruction_prefix}"
-#     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in {organism} if these specific identifiers are not explicitly found. "
-#     f"Identify its primary associated country/geographic location. "
-#     f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
-#     f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
-#     f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
-#     f"Provide only {output_format_str}. "
-#     f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
-#     f"If the {explain_list} is not 'unknown', write 1 sentence after the output explaining how you inferred it from the text (one sentence for each)."
-#     f"\n\nText Snippets:\n{context_for_llm}\n\n"
-#     f"Output Format: {output_format_str}"
-# )
-#         prompt_for_llm = (
-#     f"{prompt_instruction_prefix}"
-#     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
-#     f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
-#     f"Identify its **primary associated geographic location**, preferring the most specific available: "
-#     f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
-#     f"the next most specific region, continent, island, or other clear geographic area mentioned. "
-#     f"If no geographic clues at all are present, state 'unknown' for location. "
-#     f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
-#     f"or 'ancient' (prehistoric/archaeological) source. "
-#     f"If the text does not specify ancient or archaeological context, assume 'modern'. "
-#     f"Provide only {output_format_str}. "
-#     f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
-#     f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text (one sentence for each)."
-#     f"\n\nText Snippets:\n{context_for_llm}\n\n"
-#     f"Output Format: {output_format_str}"
-# )
-        prompt_for_llm = (
-    f"{prompt_instruction_prefix}"
-    f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
-    f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
-    f"Identify its **primary associated geographic location**, preferring the most specific available: "
-    f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
-    f"the next most specific region, continent, island, or other clear geographic area mentioned. "
-    f"If no geographic clues at all are present, state 'unknown' for location. "
-    f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
-    f"or 'ancient' (prehistoric/archaeological) source. "
-    f"If the text does not specify ancient or archaeological context, assume 'modern'. "
-    f"Provide only {output_format_str}. "
-    f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
-    f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text "
-    f"(one sentence for each). "
-    f"Format your answer so that:\n"
-    f"1. The **first line** contains only the {output_format_str} answer.\n"
-    f"2. The **second line onward** contains the explanations based on the order of the non-unknown {output_format_str} answer.\n"
-    f"\nText Snippets:\n{context_for_llm}\n\n"
-    f"Output Format Example:\nBrunei, modern.\n"
-    f"The text explicitly states BRU18 in the context of brunei (borneo), indicating the country and a broader geographic region within that country."
-    f"The study is published in a journal, implying research on living individuals, hence modern."
-)
-
-        if model_ai:
-          print("back up to ", model_ai)
-          llm_response_text, model_instance = call_llm_api(prompt_for_llm, model=model_ai)
-        else:
-          print("still 2.5 flash gemini")
-          llm_response_text, model_instance = call_llm_api(prompt_for_llm)
-        print("\n--- DEBUG INFO FOR RAG ---")
-        print("Retrieved Context Sent to LLM (first 500 chars):")
-        print(context_for_llm[:500] + "..." if len(context_for_llm) > 500 else context_for_llm)
-        print("\nRaw LLM Response:")
-        print(llm_response_text)
-        print("--- END DEBUG INFO ---")
-
-        llm_cost = 0
-        if model_instance:
-            try:
-                input_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(prompt_for_llm).total_tokens
-                output_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(llm_response_text).total_tokens
-                print(f"  DEBUG: LLM Input tokens: {input_llm_tokens}")
-                print(f"  DEBUG: LLM Output tokens: {output_llm_tokens}")
-                llm_cost = (input_llm_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
-                           (output_llm_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
-                print(f"  DEBUG: Estimated LLM cost: ${llm_cost:.6f}")
-            except Exception as e:
-                print(f"  DEBUG: Error counting LLM tokens: {e}")
-                llm_cost = 0
-
-        total_query_cost += current_embedding_cost + llm_cost
-        print(f"  DEBUG: Total estimated cost for this RAG query: ${total_query_cost:.6f}")
-        # Parse the LLM's response based on the Output Format actually used
-        # if output_format_str == "ethnicity, specific_location/unknown": # Targeted RAG output
-        #     extracted_ethnicity,extracted_specific_location = clean_llm_output(llm_response_text, output_format_str)
-        # elif output_format_str == "modern/ancient/unknown, ethnicity, specific_location/unknown":
-        #     extracted_type, extracted_ethnicity,extracted_specific_location=clean_llm_output(llm_response_text, output_format_str) 
-        # else: # Full RAG output (country, type, ethnicity, specific_location)
-        #     extracted_country,extracted_type, extracted_ethnicity,extracted_specific_location=clean_llm_output(llm_response_text, output_format_str) 
-        metadata_list = parse_multi_sample_llm_output(llm_response_text, output_format_str)
-        # merge_metadata = merge_metadata_outputs(metadata_list)
-        # if output_format_str == "country_name, modern/ancient/unknown":
-        #   extracted_country, extracted_type = merge_metadata["country"], merge_metadata["sample_type"]  
-        #   country_explanation,sample_type_explanation = merge_metadata["country_explanation"], merge_metadata["sample_type_explanation"]
-        # elif output_format_str == "modern/ancient/unknown":
-        #   extracted_type = merge_metadata["sample_type"]  
-        #   sample_type_explanation = merge_metadata["sample_type_explanation"]
-        # for the output_format that is not default
-        if output_format_str == "country_name, modern/ancient/unknown":
-          outputs = output_format_str.split(", ")
-          extracted_country, extracted_type = metadata_list[outputs[0]]["answer"], metadata_list[outputs[1]]["answer"]  
-          country_explanation,sample_type_explanation = metadata_list[outputs[0]][outputs[0]+"_explanation"], metadata_list[outputs[1]][outputs[1]+"_explanation"]
-          # extracted_ethnicity, extracted_specific_location = metadata_list[outputs[2]]["answer"], metadata_list[outputs[3]]["answer"]  
-          # ethnicity_explanation, specific_loc_explanation = metadata_list[outputs[2]][outputs[2]+"_explanation"], metadata_list[outputs[3]][outputs[3]+"_explanation"]
-    # 6. Optional: Second LLM call for specific_location from general knowledge if still unknown
-    # if extracted_specific_location == 'unknown':
-    #     # Check if we have enough info to ask general knowledge LLM
-    #     if extracted_country != 'unknown' and extracted_ethnicity != 'unknown':
-    #         print(f"  DEBUG: Specific location still unknown. Querying general knowledge LLM from '{extracted_ethnicity}' and '{extracted_country}'.")
-
-    #         general_knowledge_prompt = (
-    #             f"Based on general knowledge, what is a highly specific location (city or district) "
-    #             f"associated with the ethnicity '{extracted_ethnicity}' in '{extracted_country}'? "
-    #             f"Consider the context of scientific studies on human genetics, if known. "
-    #             f"If no common specific location is known, state 'unknown'. "
-    #             f"Provide only the city or district name, or 'unknown'."
-    #         )
-
-    #         general_llm_response, general_llm_model_instance = call_llm_api(general_knowledge_prompt, model_name='gemini-1.5-flash-latest')
-
-    #         if general_llm_response and general_llm_response.lower().strip() != 'unknown':
-    #             extracted_specific_location = general_llm_response.strip() + " (predicted from general knowledge)"
-    #             # Add cost of this second LLM call
-    #             if general_llm_model_instance:
-    #                 try:
-    #                     gk_input_tokens = general_llm_model_instance.count_tokens(general_knowledge_prompt).total_tokens
-    #                     gk_output_tokens = general_llm_model_instance.count_tokens(general_llm_response).total_tokens
-    #                     gk_cost = (gk_input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
-    #                               (gk_output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
-    #                     print(f"  DEBUG: General Knowledge LLM cost to predict specific location alone: ${gk_cost:.6f}")
-    #                     total_query_cost += gk_cost # Accumulate cost
-    #                 except Exception as e:
-    #                     print(f"  DEBUG: Error counting GK LLM tokens: {e}")
-    #         else:
-    #             print("  DEBUG: General knowledge LLM returned unknown or empty for specific location.")
-    # # 6. Optional: Second LLM call for ethnicity from general knowledge if still unknown
-    # if extracted_ethnicity == 'unknown':
-    #     # Check if we have enough info to ask general knowledge LLM
-    #     if extracted_country != 'unknown' and extracted_specific_location != 'unknown':
-    #         print(f"  DEBUG: Ethnicity still unknown. Querying general knowledge LLM from '{extracted_specific_location}' and '{extracted_country}'.")
-
-    #         general_knowledge_prompt = (
-    #             f"Based on general knowledge, what is a highly ethnicity (population) "
-    #             f"associated with the specific location '{extracted_specific_location}' in '{extracted_country}'? "
-    #             f"Consider the context of scientific studies on human genetics, if known. "
-    #             f"If no common ethnicity is known, state 'unknown'. "
-    #             f"Provide only the ethnicity or popluation name, or 'unknown'."
-    #         )
-
-    #         general_llm_response, general_llm_model_instance = call_llm_api(general_knowledge_prompt, model_name='gemini-1.5-flash-latest')
-
-    #         if general_llm_response and general_llm_response.lower().strip() != 'unknown':
-    #             extracted_ethnicity = general_llm_response.strip() + " (predicted from general knowledge)"
-    #             # Add cost of this second LLM call
-    #             if general_llm_model_instance:
-    #                 try:
-    #                     gk_input_tokens = general_llm_model_instance.count_tokens(general_knowledge_prompt).total_tokens
-    #                     gk_output_tokens = general_llm_model_instance.count_tokens(general_llm_response).total_tokens
-    #                     gk_cost = (gk_input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
-    #                               (gk_output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
-    #                     print(f"  DEBUG: General Knowledge LLM cost to predict ethnicity alone: ${gk_cost:.6f}")
-    #                     total_query_cost += gk_cost # Accumulate cost
-    #                 except Exception as e:
-    #                     print(f"  DEBUG: Error counting GK LLM tokens: {e}")
-    #         else:
-    #             print("  DEBUG: General knowledge LLM returned unknown or empty for ethnicity.")            
-
-
-    #return extracted_country, extracted_type, method_used, extracted_ethnicity, extracted_specific_location, total_query_cost
-    return extracted_country, extracted_type, method_used, country_explanation, sample_type_explanation, total_query_cost
+import re
+import pycountry
+from docx import Document
+import json
+import os
+import numpy as np
+import faiss
+from collections import defaultdict
+import ast # For literal_eval
+import math # For ceiling function
+import data_preprocess
+import mtdna_classifier
+# --- IMPORTANT: UNCOMMENT AND CONFIGURE YOUR REAL API KEY ---
+import google.generativeai as genai
+
+#genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
+
+import nltk
+from nltk.corpus import stopwords
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords')
+nltk.download('punkt_tab')    
+# # --- Define Pricing Constants (for Gemini 1.5 Flash & text-embedding-004) ---
+# # Prices are per 1,000 tokens
+# PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
+# PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
+# PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
+
+# Gemini 2.5 Flash-Lite pricing per 1,000 tokens
+PRICE_PER_1K_INPUT_LLM = 0.00010      # $0.10 per 1M input tokens
+PRICE_PER_1K_OUTPUT_LLM = 0.00040     # $0.40 per 1M output tokens
+
+# Embedding-001 pricing per 1,000 input tokens
+PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens
+# --- API Functions (REAL API FUNCTIONS) ---
+
+# def get_embedding(text, task_type="RETRIEVAL_DOCUMENT"):
+#     """Generates an embedding for the given text using a Google embedding model."""
+#     try:
+#         result = genai.embed_content(
+#             model="models/text-embedding-004", # Specify the embedding model
+#             content=text,
+#             task_type=task_type
+#         )
+#         return np.array(result['embedding']).astype('float32')
+#     except Exception as e:
+#         print(f"Error getting embedding: {e}")
+#         return np.zeros(768, dtype='float32')
+def get_embedding(text, task_type="RETRIEVAL_DOCUMENT"):
+    """Safe Gemini 1.5 embedding call with fallback."""
+    import numpy as np
+    try:
+        if not text or len(text.strip()) == 0:
+            raise ValueError("Empty text cannot be embedded.")
+        result = genai.embed_content(
+            model="models/text-embedding-004",
+            content=text,
+            task_type=task_type
+        )
+        return np.array(result['embedding'], dtype='float32')
+    except Exception as e:
+        print(f"❌ Embedding error: {e}")
+        return np.zeros(768, dtype='float32')
+
+
+def call_llm_api(prompt, model_name="gemini-2.5-flash-lite"):#'gemini-1.5-flash-latest'):
+    """Calls a Google Gemini LLM with the given prompt."""
+    try:
+        model = genai.GenerativeModel(model_name)
+        response = model.generate_content(prompt)
+        return response.text, model # Return model instance for token counting
+    except Exception as e:
+        print(f"Error calling LLM: {e}")
+        return "Error: Could not get response from LLM API.", None
+
+
+# --- Core Document Processing Functions (All previously provided and fixed) ---
+
+def read_docx_text(path):
+    """
+    Reads text and extracts potential table-like strings from a .docx document.
+    Separates plain text from structured [ [ ] ] list-like tables.
+    Also attempts to extract a document title.
+    """
+    doc = Document(path)
+    plain_text_paragraphs = []
+    table_strings = []
+    document_title = "Unknown Document Title" # Default
+
+    # Attempt to extract the document title from the first few paragraphs
+    title_paragraphs = [p.text.strip() for p in doc.paragraphs[:5] if p.text.strip()]
+    if title_paragraphs:
+        # A heuristic to find a title: often the first or second non-empty paragraph
+        # or a very long first paragraph if it's the title
+        if len(title_paragraphs[0]) > 50 and "Human Genetics" not in title_paragraphs[0]:
+            document_title = title_paragraphs[0]
+        elif len(title_paragraphs) > 1 and len(title_paragraphs[1]) > 50 and "Human Genetics" not in title_paragraphs[1]:
+            document_title = title_paragraphs[1]
+        elif any("Complete mitochondrial genomes" in p for p in title_paragraphs):
+            # Fallback to a known title phrase if present
+            document_title = "Complete mitochondrial genomes of Thai and Lao populations indicate an ancient origin of Austroasiatic groups and demic diffusion in the spread of Tai–Kadai languages"
+
+    current_table_lines = []
+    in_table_parsing_mode = False
+
+    for p in doc.paragraphs:
+        text = p.text.strip()
+        if not text:
+            continue
+
+        # Condition to start or continue table parsing
+        if text.startswith("## Table "): # Start of a new table section
+            if in_table_parsing_mode and current_table_lines:
+                table_strings.append("\n".join(current_table_lines))
+            current_table_lines = [text] # Include the "## Table X" line
+            in_table_parsing_mode = True
+        elif in_table_parsing_mode and (text.startswith("[") or text.startswith('"')):
+            # Continue collecting lines if we're in table mode and it looks like table data
+            # Table data often starts with '[' for lists, or '"' for quoted strings within lists.
+            current_table_lines.append(text)
+        else:
+            # If not in table mode, or if a line doesn't look like table data,
+            # then close the current table (if any) and add the line to plain text.
+            if in_table_parsing_mode and current_table_lines:
+                table_strings.append("\n".join(current_table_lines))
+                current_table_lines = []
+            in_table_parsing_mode = False
+            plain_text_paragraphs.append(text)
+
+    # After the loop, add any remaining table lines
+    if current_table_lines:
+        table_strings.append("\n".join(current_table_lines))
+
+    return "\n".join(plain_text_paragraphs), table_strings, document_title
+
+# --- Structured Data Extraction and RAG Functions ---
+
+def parse_literal_python_list(table_str):
+    list_match = re.search(r'(\[\s*\[\s*(?:.|\n)*?\s*\]\s*\])', table_str)
+    #print("Debug: list_match object (before if check):", list_match)
+    if not list_match:
+        if "table" in table_str.lower(): # then the table doest have the "]]" at the end
+            table_str += "]]"
+            list_match = re.search(r'(\[\s*\[\s*(?:.|\n)*?\s*\]\s*\])', table_str)
+    if list_match:
+        try:
+            matched_string = list_match.group(1)
+            #print("Debug: Matched string for literal_eval:", matched_string)
+            return ast.literal_eval(matched_string)
+        except (ValueError, SyntaxError) as e:
+            print(f"Error evaluating literal: {e}")
+            return []
+    return []
+
+
+_individual_code_parser = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
+def _parse_individual_code_parts(code_str):
+    match = _individual_code_parser.search(code_str)
+    if match:
+        return match.group(1), match.group(2)
+    return None, None
+
+
+def parse_sample_id_to_population_code(plain_text_content):
+    sample_id_map = {}
+    contiguous_ranges_data = defaultdict(list)
+
+    #section_start_marker = "The sample identification of each population is as follows:"
+    section_start_marker = ["The sample identification of each population is as follows:","## table"]
+    
+    for s in section_start_marker:
+      relevant_text_search = re.search(
+          re.escape(s.lower()) + r"\s*(.*?)(?=\n##|\Z)",
+          plain_text_content.lower(),
+          re.DOTALL
+      )
+      if relevant_text_search: 
+        break
+      
+    if not relevant_text_search:
+        print("Warning: 'Sample ID Population Code' section start marker not found or block empty.")
+        return sample_id_map, contiguous_ranges_data
+
+    relevant_text_block = relevant_text_search.group(1).strip()
+
+    # print(f"\nDEBUG_PARSING: --- Start of relevant_text_block (first 500 chars) ---")
+    # print(relevant_text_block[:500])
+    # print(f"DEBUG_PARSING: --- End of relevant_text_block (last 500 chars) ---")
+    # print(relevant_text_block[-500:])
+    # print(f"DEBUG_PARSING: Relevant text block length: {len(relevant_text_block)}")
+
+    mapping_pattern = re.compile(
+    r'\b([A-Z0-9]+\d+)(?:-([A-Z0-9]+\d+))?\s+([A-Z0-9]+)\b', # Changed the last group
+    re.IGNORECASE)
+
+    range_expansion_count = 0
+    direct_id_count = 0
+    total_matches_found = 0
+    for match in mapping_pattern.finditer(relevant_text_block):
+        total_matches_found += 1
+        id1_full_str, id2_full_str_opt, pop_code = match.groups()
+
+        #print(f"  DEBUG_PARSING: Matched: '{match.group(0)}'")
+
+        pop_code_upper = pop_code.upper()
+
+        id1_prefix, id1_num_str = _parse_individual_code_parts(id1_full_str)
+        if id1_prefix is None:
+            #print(f"    DEBUG_PARSING: Failed to parse ID1: {id1_full_str}. Skipping this mapping.")
+            continue
+
+        if id2_full_str_opt:
+            id2_prefix_opt, id2_num_str_opt = _parse_individual_code_parts(id2_full_str_opt)
+            if id2_prefix_opt is None:
+                #print(f"    DEBUG_PARSING: Failed to parse ID2: {id2_full_str_opt}. Treating {id1_full_str} as single ID1.")
+                sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
+                direct_id_count += 1
+                continue
+
+            #print(f"    DEBUG_PARSING: Comparing prefixes: '{id1_prefix.lower()}' vs '{id2_prefix_opt.lower()}'")
+            if id1_prefix.lower() == id2_prefix_opt.lower():
+                #print(f"    DEBUG_PARSING: ---> Prefixes MATCH for range expansion! Range: {id1_prefix}{id1_num_str}-{id2_prefix_opt}{id2_num_str_opt}")
+                try:
+                    start_num = int(id1_num_str)
+                    end_num = int(id2_num_str_opt)
+                    for num in range(start_num, end_num + 1):
+                        sample_id = f"{id1_prefix.upper()}{num}"
+                        sample_id_map[sample_id] = pop_code_upper
+                        range_expansion_count += 1
+                    contiguous_ranges_data[id1_prefix.upper()].append(
+                        (start_num, end_num, pop_code_upper)
+                    )
+                except ValueError:
+                    print(f"        DEBUG_PARSING: ValueError in range conversion for {id1_num_str}-{id2_num_str_opt}. Adding endpoints only.")
+                    sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
+                    sample_id_map[f"{id2_prefix_opt.upper()}{id2_num_str_opt}"] = pop_code_upper
+                    direct_id_count += 2
+            else:
+                #print(f"    DEBUG_PARSING: Prefixes MISMATCH for range: '{id1_prefix}' vs '{id2_prefix_opt}'. Adding endpoints only.")
+                sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
+                sample_id_map[f"{id2_prefix_opt.upper()}{id2_num_str_opt}"] = pop_code_upper
+                direct_id_count += 2
+        else:
+            sample_id_map[f"{id1_prefix.upper()}{id1_num_str}"] = pop_code_upper
+            direct_id_count += 1
+
+    # print(f"DEBUG_PARSING: Total matches found by regex: {total_matches_found}.")
+    # print(f"DEBUG_PARSING: Parsed sample IDs: {len(sample_id_map)} total entries.")
+    # print(f"DEBUG_PARSING:   (including {range_expansion_count} from range expansion and {direct_id_count} direct ID/endpoint entries).")
+    return sample_id_map, contiguous_ranges_data
+
+country_keywords_regional_overrides = {
+    "north thailand": "Thailand", "central thailand": "Thailand",
+    "northeast thailand": "Thailand", "east myanmar": "Myanmar", "west thailand": "Thailand",
+    "central india": "India", "east india": "India", "northeast india": "India",
+    "south sibera": "Russia", "siberia": "Russia", "yunnan": "China", #"tibet": "China",
+    "sumatra": "Indonesia", "borneo": "Indonesia",
+    "northern mindanao": "Philippines", "west malaysia": "Malaysia",
+    "mongolia": "China",
+    "beijing": "China",
+    "north laos": "Laos", "central laos": "Laos",
+    "east myanmar": "Myanmar", "west myanmar": "Myanmar"}
+
+# Updated get_country_from_text function
+def get_country_from_text(text):
+    text_lower = text.lower()
+
+    # 1. Use pycountry for official country names and common aliases
+    for country in pycountry.countries:
+        # Check full name match first
+        if text_lower == country.name.lower():
+            return country.name
+        
+        # Safely check for common_name
+        if hasattr(country, 'common_name') and text_lower == country.common_name.lower():
+            return country.common_name
+            
+        # Safely check for official_name
+        if hasattr(country, 'official_name') and text_lower == country.official_name.lower():
+            return country.official_name
+
+        # Check if country name is part of the text (e.g., 'Thailand' in 'Thailand border')
+        if country.name.lower() in text_lower:
+            return country.name
+            
+        # Safely check if common_name is part of the text
+        if hasattr(country, 'common_name') and country.common_name.lower() in text_lower:
+            return country.common_name
+    # 2. Prioritize specific regional overrides
+    for keyword, country in country_keywords_regional_overrides.items():
+        if keyword in text_lower:
+            return country
+    # 3. Check for broader regions that you want to map to "unknown" or a specific country
+    if "north asia" in text_lower or "southeast asia" in text_lower or "east asia" in text_lower:
+        return "unknown"
+
+    return "unknown"
+
+# Get the list of English stop words from NLTK
+non_meaningful_pop_names = set(stopwords.words('english'))
+
+def parse_population_code_to_country(plain_text_content, table_strings):
+    pop_code_country_map = {}
+    pop_code_ethnicity_map = {} # NEW: To store ethnicity for structured lookup
+    pop_code_specific_loc_map = {} # NEW: To store specific location for structured lookup
+
+    # Regex for parsing population info in structured lists and general text
+    # This pattern captures: (Pop Name/Ethnicity) (Pop Code) (Region/Specific Location) (Country) (Linguistic Family)
+    # The 'Pop Name/Ethnicity' (Group 1) is often the ethnicity
+    pop_info_pattern = re.compile(
+          r'([A-Za-z\s]+?)\s+([A-Z]+\d*)\s+'      # Pop Name (Group 1), Pop Code (Group 2) - Changed \d+ to \d* for codes like 'SH'
+          r'([A-Za-z\s\(\)\-,\/]+?)\s+'          # Region/Specific Location (Group 3)
+          r'(North+|South+|West+|East+|Thailand|Laos|Cambodia|Myanmar|Philippines|Indonesia|Malaysia|China|India|Taiwan|Vietnam|Russia|Nepal|Japan|South Korea)\b' # Country (Group 4)
+          r'(?:.*?([A-Za-z\s\-]+))?\s*'          # Optional Linguistic Family (Group 5), made optional with ?, followed by optional space
+          r'(\d+(?:\s+\d+\.?\d*)*)?', # Match all the numbers (Group 6) - made optional
+          re.IGNORECASE
+      )
+    for table_str in table_strings:
+        table_data = parse_literal_python_list(table_str)
+        if table_data:
+            is_list_of_lists = bool(table_data) and isinstance(table_data[0], list)
+            if is_list_of_lists:
+                for row_idx, row in enumerate(table_data):
+                    row_text = " ".join(map(str, row))
+                    match = pop_info_pattern.search(row_text)
+                    if match:
+                        pop_name = match.group(1).strip()
+                        pop_code = match.group(2).upper()
+                        specific_loc_text = match.group(3).strip()
+                        country_text = match.group(4).strip()
+                        linguistic_family = match.group(5).strip() if match.group(5) else 'unknown'
+
+                        final_country = get_country_from_text(country_text)
+                        if final_country == 'unknown': # Try specific loc text for country if direct country is not found
+                            final_country = get_country_from_text(specific_loc_text)
+
+                        if pop_code:
+                            pop_code_country_map[pop_code] = final_country
+
+                            # Populate ethnicity map (often Pop Name is ethnicity)
+                            pop_code_ethnicity_map[pop_code] = pop_name
+
+                            # Populate specific location map
+                            pop_code_specific_loc_map[pop_code] = specific_loc_text # Store as is from text
+            else:
+                row_text = " ".join(map(str, table_data))   
+                match = pop_info_pattern.search(row_text)
+                if match:
+                    pop_name = match.group(1).strip()
+                    pop_code = match.group(2).upper()
+                    specific_loc_text = match.group(3).strip()
+                    country_text = match.group(4).strip()
+                    linguistic_family = match.group(5).strip() if match.group(5) else 'unknown'
+
+                    final_country = get_country_from_text(country_text)
+                    if final_country == 'unknown': # Try specific loc text for country if direct country is not found
+                        final_country = get_country_from_text(specific_loc_text)
+
+                    if pop_code:
+                        pop_code_country_map[pop_code] = final_country
+
+                        # Populate ethnicity map (often Pop Name is ethnicity)
+                        pop_code_ethnicity_map[pop_code] = pop_name
+
+                        # Populate specific location map
+                        pop_code_specific_loc_map[pop_code] = specific_loc_text # Store as is from text
+
+                        # # Special case refinements for ethnicity/location if more specific rules are known from document:
+                        # if pop_name.lower() == "khon mueang": # and specific conditions if needed
+                        #     pop_code_ethnicity_map[pop_code] = "Khon Mueang"
+                        #     # If Khon Mueang has a specific city/district, add here
+                        #     # e.g., if 'Chiang Mai' is directly linked to KM1 in a specific table
+                        #     # pop_code_specific_loc_map[pop_code] = "Chiang Mai"
+                        # elif pop_name.lower() == "lawa":
+                        #      pop_code_ethnicity_map[pop_code] = "Lawa"
+                        # # Add similar specific rules for other populations (e.g., Mon for MO1, MO2, MO3)
+                        # elif pop_name.lower() == "mon":
+                        #     pop_code_ethnicity_map[pop_code] = "Mon"
+                        #     # For MO2: "West Thailand (Thailand Myanmar border)" -> no city
+                        #     # For MO3: "East Myanmar (Thailand Myanmar border)" -> no city
+                        #     # If the doc gives "Bangkok" for MO4, add it here for MO4's actual specific_location.
+                        # # etc.
+
+    # Fallback to parsing general plain text content (sentences)
+    sentences = data_preprocess.extract_sentences(plain_text_content)
+    for s in sentences: # Still focusing on just this one sentence
+      # Use re.finditer to get all matches
+      matches = pop_info_pattern.finditer(s)
+      pop_name, pop_code, specific_loc_text, country_text = "unknown", "unknown", "unknown", "unknown"
+      for match in matches:
+          if match.group(1):
+            pop_name = match.group(1).strip()
+          if match.group(2):  
+            pop_code = match.group(2).upper()
+          if match.group(3):  
+            specific_loc_text = match.group(3).strip()
+          if match.group(4):  
+            country_text = match.group(4).strip()
+          # linguistic_family = match.group(5).strip() if match.group(5) else 'unknown' # Already captured by pop_info_pattern
+
+          final_country = get_country_from_text(country_text)
+          if final_country == 'unknown':
+              final_country = get_country_from_text(specific_loc_text)
+
+          if pop_code.lower() not in non_meaningful_pop_names:
+            if final_country.lower() not in non_meaningful_pop_names:
+              pop_code_country_map[pop_code] = final_country
+            if pop_name.lower() not in non_meaningful_pop_names:  
+              pop_code_ethnicity_map[pop_code] = pop_name # Default ethnicity from Pop Name
+            if specific_loc_text.lower() not in non_meaningful_pop_names:  
+              pop_code_specific_loc_map[pop_code] = specific_loc_text
+
+              # Specific rules for ethnicity/location in plain text:
+              if pop_name.lower() == "khon mueang":
+                  pop_code_ethnicity_map[pop_code] = "Khon Mueang"
+              elif pop_name.lower() == "lawa":
+                  pop_code_ethnicity_map[pop_code] = "Lawa"
+              elif pop_name.lower() == "mon":
+                  pop_code_ethnicity_map[pop_code] = "Mon"
+              elif pop_name.lower() == "seak": # Added specific rule for Seak
+                  pop_code_ethnicity_map[pop_code] = "Seak"
+              elif pop_name.lower() == "nyaw": # Added specific rule for Nyaw
+                  pop_code_ethnicity_map[pop_code] = "Nyaw"
+              elif pop_name.lower() == "nyahkur": # Added specific rule for Nyahkur
+                  pop_code_ethnicity_map[pop_code] = "Nyahkur"
+              elif pop_name.lower() == "suay": # Added specific rule for Suay
+                  pop_code_ethnicity_map[pop_code] = "Suay"
+              elif pop_name.lower() == "soa": # Added specific rule for Soa
+                  pop_code_ethnicity_map[pop_code] = "Soa"
+              elif pop_name.lower() == "bru": # Added specific rule for Bru
+                  pop_code_ethnicity_map[pop_code] = "Bru"
+              elif pop_name.lower() == "khamu": # Added specific rule for Khamu
+                  pop_code_ethnicity_map[pop_code] = "Khamu"
+
+    return pop_code_country_map, pop_code_ethnicity_map, pop_code_specific_loc_map
+
+def general_parse_population_code_to_country(plain_text_content, table_strings):
+    pop_code_country_map = {}
+    pop_code_ethnicity_map = {}
+    pop_code_specific_loc_map = {}
+    sample_id_to_pop_code = {}
+
+    for table_str in table_strings:
+        table_data = parse_literal_python_list(table_str)
+        if not table_data or not isinstance(table_data[0], list):
+            continue
+
+        header_row = [col.lower() for col in table_data[0]]
+        header_map = {col: idx for idx, col in enumerate(header_row)}
+
+        # MJ17: Direct PopCode → Country
+        if 'id' in header_map and 'country' in header_map:
+            for row in table_strings[1:]:
+                row = parse_literal_python_list(row)[0]
+                if len(row) < len(header_row):
+                    continue
+                pop_code = str(row[header_map['id']]).strip()
+                country = str(row[header_map['country']]).strip()
+                province = row[header_map['province']].strip() if 'province' in header_map else 'unknown'
+                pop_group = row[header_map['population group / region']].strip() if 'population group / region' in header_map else 'unknown'
+                pop_code_country_map[pop_code] = country
+                pop_code_specific_loc_map[pop_code] = province
+                pop_code_ethnicity_map[pop_code] = pop_group
+
+        # A1YU101 or EBK/KSK: SampleID → PopCode
+        elif 'sample id' in header_map and 'population code' in header_map:
+            for row in table_strings[1:]:
+                row = parse_literal_python_list(row)[0]
+                if len(row) < 2:
+                    continue
+                sample_id = row[header_map['sample id']].strip().upper()
+                pop_code = row[header_map['population code']].strip().upper()
+                sample_id_to_pop_code[sample_id] = pop_code
+
+        # PopCode → Country (A1YU101/EBK mapping)
+        elif 'population code' in header_map and 'country' in header_map:
+            for row in table_strings[1:]:
+                row = parse_literal_python_list(row)[0]
+                if len(row) < 2:
+                    continue
+                pop_code = row[header_map['population code']].strip().upper()
+                country = row[header_map['country']].strip()
+                pop_code_country_map[pop_code] = country
+
+    return pop_code_country_map, pop_code_ethnicity_map, pop_code_specific_loc_map, sample_id_to_pop_code
+
+def chunk_text(text, chunk_size=500, overlap=50):
+    """Splits text into chunks (by words) with overlap."""
+    chunks = []
+    words = text.split()
+    num_words = len(words)
+
+    start = 0
+    while start < num_words:
+        end = min(start + chunk_size, num_words)
+        chunk = " ".join(words[start:end])
+        chunks.append(chunk)
+
+        if end == num_words:
+            break
+        start += chunk_size - overlap # Move start by (chunk_size - overlap)
+    return chunks
+
+def build_vector_index_and_data(doc_path, index_path="faiss_index.bin", chunks_path="document_chunks.json", structured_path="structured_lookup.json"):
+    """
+    Reads document, builds structured lookup, chunks remaining text, embeds chunks,
+    and builds/saves a FAISS index.
+    """
+    print("Step 1: Reading document and extracting structured data...")
+    # plain_text_content, table_strings, document_title = read_docx_text(doc_path) # Get document_title here
+
+    # sample_id_map, contiguous_ranges_data = parse_sample_id_to_population_code(plain_text_content)
+    # pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc = parse_population_code_to_country(plain_text_content, table_strings)
+
+    # master_structured_lookup = {}
+    # master_structured_lookup['document_title'] = document_title # Store document title
+    # master_structured_lookup['sample_id_map'] = sample_id_map
+    # master_structured_lookup['contiguous_ranges'] = dict(contiguous_ranges_data)
+    # master_structured_lookup['pop_code_to_country'] = pop_code_to_country
+    # master_structured_lookup['pop_code_to_ethnicity'] = pop_code_to_ethnicity # NEW: Store pop_code to ethnicity map
+    # master_structured_lookup['pop_code_to_specific_loc'] = pop_code_to_specific_loc # NEW: Store pop_code to specific_loc map
+
+
+    # # Final consolidation: Use sample_id_map to derive full info for queries
+    # final_structured_entries = {}
+    # for sample_id, pop_code in master_structured_lookup['sample_id_map'].items():
+    #     country = master_structured_lookup['pop_code_to_country'].get(pop_code, 'unknown')
+    #     ethnicity = master_structured_lookup['pop_code_to_ethnicity'].get(pop_code, 'unknown') # Retrieve ethnicity
+    #     specific_location = master_structured_lookup['pop_code_to_specific_loc'].get(pop_code, 'unknown') # Retrieve specific location
+
+    #     final_structured_entries[sample_id] = {
+    #         'population_code': pop_code,
+    #         'country': country,
+    #         'type': 'modern',
+    #         'ethnicity': ethnicity, # Store ethnicity
+    #         'specific_location': specific_location # Store specific location
+    #     }
+    # master_structured_lookup['final_structured_entries'] = final_structured_entries
+    plain_text_content, table_strings, document_title = read_docx_text(doc_path)
+    pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc, sample_id_map = general_parse_population_code_to_country(plain_text_content, table_strings)
+
+    final_structured_entries = {}
+    if sample_id_map:
+        for sample_id, pop_code in sample_id_map.items():
+            country = pop_code_to_country.get(pop_code, 'unknown')
+            ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
+            specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
+            final_structured_entries[sample_id] = {
+                'population_code': pop_code,
+                'country': country,
+                'type': 'modern',
+                'ethnicity': ethnicity,
+                'specific_location': specific_loc
+            }
+    else:
+        for pop_code in pop_code_to_country.keys():
+            country = pop_code_to_country.get(pop_code, 'unknown')
+            ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
+            specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
+            final_structured_entries[pop_code] = {
+                'population_code': pop_code,
+                'country': country,
+                'type': 'modern',
+                'ethnicity': ethnicity,
+                'specific_location': specific_loc
+            }
+    if not final_structured_entries:
+      # traditional way of A1YU101
+      sample_id_map, contiguous_ranges_data = parse_sample_id_to_population_code(plain_text_content)
+      pop_code_to_country, pop_code_to_ethnicity, pop_code_to_specific_loc = parse_population_code_to_country(plain_text_content, table_strings)
+      if sample_id_map:
+        for sample_id, pop_code in sample_id_map.items():
+            country = pop_code_to_country.get(pop_code, 'unknown')
+            ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
+            specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
+            final_structured_entries[sample_id] = {
+                'population_code': pop_code,
+                'country': country,
+                'type': 'modern',
+                'ethnicity': ethnicity,
+                'specific_location': specific_loc
+            }
+      else:
+          for pop_code in pop_code_to_country.keys():
+              country = pop_code_to_country.get(pop_code, 'unknown')
+              ethnicity = pop_code_to_ethnicity.get(pop_code, 'unknown')
+              specific_loc = pop_code_to_specific_loc.get(pop_code, 'unknown')
+              final_structured_entries[pop_code] = {
+                  'population_code': pop_code,
+                  'country': country,
+                  'type': 'modern',
+                  'ethnicity': ethnicity,
+                  'specific_location': specific_loc
+              }
+    
+    master_lookup = {
+        'document_title': document_title,
+        'pop_code_to_country': pop_code_to_country,
+        'pop_code_to_ethnicity': pop_code_to_ethnicity,
+        'pop_code_to_specific_loc': pop_code_to_specific_loc,
+        'sample_id_map': sample_id_map,
+        'final_structured_entries': final_structured_entries
+    }
+    print(f"Structured lookup built with {len(final_structured_entries)} entries in 'final_structured_entries'.")
+
+    with open(structured_path, 'w') as f:
+        json.dump(master_lookup, f, indent=4)
+    print(f"Structured lookup saved to {structured_path}.")
+
+    print("Step 2: Chunking document for RAG vector index...")
+    # replace the chunk here with the all_output from process_inputToken and fallback to this traditional chunk
+    clean_text, clean_table = "", ""
+    if plain_text_content:
+      clean_text = data_preprocess.normalize_for_overlap(plain_text_content)
+    if table_strings:
+      clean_table = data_preprocess.normalize_for_overlap(". ".join(table_strings))
+    all_clean_chunk = clean_text + clean_table
+    document_chunks = chunk_text(all_clean_chunk)
+    print(f"Document chunked into {len(document_chunks)} chunks.")
+    
+    print("Step 3: Generating embeddings for chunks (this might take time and cost API calls)...")
+
+    embedding_model_for_chunks = genai.GenerativeModel('models/text-embedding-004')
+
+    chunk_embeddings = []
+    for i, chunk in enumerate(document_chunks):
+        embedding = get_embedding(chunk, task_type="RETRIEVAL_DOCUMENT")
+        if embedding is not None and embedding.shape[0] > 0:
+            chunk_embeddings.append(embedding)
+        else:
+            print(f"Warning: Failed to get valid embedding for chunk {i}. Skipping.")
+            chunk_embeddings.append(np.zeros(768, dtype='float32'))
+
+    if not chunk_embeddings:
+        raise ValueError("No valid embeddings generated. Check get_embedding function and API.")
+
+    embedding_dimension = chunk_embeddings[0].shape[0]
+    index = faiss.IndexFlatL2(embedding_dimension)
+    index.add(np.array(chunk_embeddings))
+
+    faiss.write_index(index, index_path)
+    with open(chunks_path, "w") as f:
+        json.dump(document_chunks, f)
+
+    print(f"FAISS index built and saved to {index_path}.")
+    print(f"Document chunks saved to {chunks_path}.")
+    return master_lookup, index, document_chunks, all_clean_chunk
+
+
+def load_rag_assets(index_path="faiss_index.bin", chunks_path="document_chunks.json", structured_path="structured_lookup.json"):
+    """Loads pre-built RAG assets (FAISS index, chunks, structured lookup)."""
+    print("Loading RAG assets...")
+    master_structured_lookup = {}
+    if os.path.exists(structured_path):
+        with open(structured_path, 'r') as f:
+            master_structured_lookup = json.load(f)
+        print("Structured lookup loaded.")
+    else:
+        print("Structured lookup file not found. Rebuilding is likely needed.")
+
+    index = None
+    chunks = []
+    if os.path.exists(index_path) and os.path.exists(chunks_path):
+        try:
+            index = faiss.read_index(index_path)
+            with open(chunks_path, "r") as f:
+                chunks = json.load(f)
+            print("FAISS index and chunks loaded.")
+        except Exception as e:
+            print(f"Error loading FAISS index or chunks: {e}. Will rebuild.")
+            index = None
+            chunks = []
+    else:
+        print("FAISS index or chunks files not found.")
+
+    return master_structured_lookup, index, chunks
+# Helper function for query_document_info
+def exactInContext(text, keyword):
+# try keyword_prfix
+  # code_pattern = re.compile(r'([A-Z0-9]+?)(\d+)$', re.IGNORECASE)
+  # # Attempt to parse the keyword into its prefix and numerical part using re.search
+  # keyword_match = code_pattern.search(keyword)
+  # keyword_prefix = None
+  # keyword_num = None
+  # if keyword_match:
+  #     keyword_prefix = keyword_match.group(1).lower()
+  #     keyword_num = int(keyword_match.group(2))
+  text = text.lower()
+  idx = text.find(keyword.lower())
+  if idx == -1:
+    # if keyword_prefix:
+    #   idx = text.find(keyword_prefix)
+    # if idx == -1:
+    #   return False
+    return False
+  return True
+def chooseContextLLM(contexts, kw):
+  # if kw in context
+  for con in contexts:
+    context = contexts[con]
+    if context:
+      if exactInContext(context, kw):
+        return con, context    
+  #if cannot find anything related to kw in context, return all output
+  if contexts["all_output"]:
+    return "all_output", contexts["all_output"]
+  else:
+    # if all_output not exist
+    # look of chunk and still not exist return document chunk
+    if contexts["chunk"]: return "chunk", contexts["chunk"]
+    elif contexts["document_chunk"]:  return "document_chunk", contexts["document_chunk"]
+    else: return None, None  
+def clean_llm_output(llm_response_text, output_format_str):
+    results = []
+    lines = llm_response_text.strip().split('\n')
+    output_country, output_type, output_ethnicity, output_specific_location = [],[],[],[]
+    for line in lines:
+        extracted_country, extracted_type, extracted_ethnicity, extracted_specific_location = "unknown", "unknown", "unknown", "unknown"
+        line = line.strip()
+        if output_format_str == "ethnicity, specific_location/unknown": # Targeted RAG output
+            parsed_output = re.search(r'^\s*([^,]+?),\s*(.+?)\s*$', llm_response_text)
+            if parsed_output:
+                extracted_ethnicity = parsed_output.group(1).strip()
+                extracted_specific_location = parsed_output.group(2).strip()
+            else:
+                print("  DEBUG: LLM did not follow expected 2-field format for targeted RAG. Defaulting to unknown for ethnicity/specific_location.")
+                extracted_ethnicity = 'unknown'
+                extracted_specific_location = 'unknown'
+        elif output_format_str == "modern/ancient/unknown, ethnicity, specific_location/unknown":
+          parsed_output = re.search(r'^\s*([^,]+?),\s*([^,]+?),\s*(.+?)\s*$', llm_response_text)
+          if parsed_output:
+              extracted_type = parsed_output.group(1).strip()
+              extracted_ethnicity = parsed_output.group(2).strip()
+              extracted_specific_location = parsed_output.group(3).strip()
+          else:
+              # Fallback: check if only 2 fields
+              parsed_output_2_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?)\s*$', llm_response_text)
+              if parsed_output_2_fields:
+                  extracted_type = parsed_output_2_fields.group(1).strip()
+                  extracted_ethnicity = parsed_output_2_fields.group(2).strip()
+                  extracted_specific_location = 'unknown'
+              else:
+                  # even simpler fallback: 1 field only
+                  parsed_output_1_field = re.search(r'^\s*([^,]+?)\s*$', llm_response_text)
+                  if parsed_output_1_field:
+                      extracted_type = parsed_output_1_field.group(1).strip()
+                      extracted_ethnicity = 'unknown'
+                      extracted_specific_location = 'unknown'
+                  else:
+                      print("  DEBUG: LLM did not follow any expected simplified format. Attempting verbose parsing fallback.")
+                      type_match_fallback = re.search(r'Type:\s*([A-Za-z\s-]+)', llm_response_text)
+                      extracted_type = type_match_fallback.group(1).strip() if type_match_fallback else 'unknown'
+                      extracted_ethnicity = 'unknown'
+                      extracted_specific_location = 'unknown'
+        else:
+          parsed_output = re.search(r'^\s*([^,]+?),\s*([^,]+?),\s*([^,]+?),\s*(.+?)\s*$', line)
+          if parsed_output:
+              extracted_country = parsed_output.group(1).strip()
+              extracted_type = parsed_output.group(2).strip()
+              extracted_ethnicity = parsed_output.group(3).strip()
+              extracted_specific_location = parsed_output.group(4).strip()
+          else:
+              print(f"  DEBUG: Line did not follow expected 4-field format: {line}")
+              parsed_output_2_fields = re.search(r'^\s*([^,]+?),\s*([^,]+?)\s*$', line)
+              if parsed_output_2_fields:
+                  extracted_country = parsed_output_2_fields.group(1).strip()
+                  extracted_type = parsed_output_2_fields.group(2).strip()
+                  extracted_ethnicity = 'unknown'
+                  extracted_specific_location = 'unknown'
+              else:
+                  print(f"  DEBUG: Fallback to verbose-style parsing: {line}")
+                  country_match_fallback = re.search(r'Country:\s*([A-Za-z\s-]+)', line)
+                  type_match_fallback = re.search(r'Type:\s*([A-Za-z\s-]+)', line)
+                  extracted_country = country_match_fallback.group(1).strip() if country_match_fallback else 'unknown'
+                  extracted_type = type_match_fallback.group(1).strip() if type_match_fallback else 'unknown'
+                  extracted_ethnicity = 'unknown'
+                  extracted_specific_location = 'unknown'
+
+        results.append({
+            "country": extracted_country,
+            "type": extracted_type,
+            "ethnicity": extracted_ethnicity,
+            "specific_location": extracted_specific_location
+            #"country_explain":extracted_country_explain,
+            #"type_explain": extracted_type_explain
+        })
+    # if more than 2 results
+    if output_format_str == "ethnicity, specific_location/unknown":
+      for result in results:
+        if result["ethnicity"] not in output_ethnicity:
+          output_ethnicity.append(result["ethnicity"])
+        if result["specific_location"] not in output_specific_location:  
+          output_specific_location.append(result["specific_location"])
+      return " or ".join(output_ethnicity), " or ".join(output_specific_location)     
+    elif output_format_str == "modern/ancient/unknown, ethnicity, specific_location/unknown":
+      for result in results:
+        if result["type"] not in output_type:
+          output_type.append(result["type"])
+        if result["ethnicity"] not in output_ethnicity:
+          output_ethnicity.append(result["ethnicity"])
+        if result["specific_location"] not in output_specific_location:  
+          output_specific_location.append(result["specific_location"])
+
+      return " or ".join(output_type)," or ".join(output_ethnicity), " or ".join(output_specific_location)    
+    else:
+      for result in results:
+        if result["country"] not in output_country:
+          output_country.append(result["country"])
+        if result["type"] not in output_type:
+          output_type.append(result["type"])
+        if result["ethnicity"] not in output_ethnicity:
+          output_ethnicity.append(result["ethnicity"])
+        if result["specific_location"] not in output_specific_location:  
+          output_specific_location.append(result["specific_location"])
+      return " or ".join(output_country)," or ".join(output_type)," or ".join(output_ethnicity), " or ".join(output_specific_location)           
+
+# def parse_multi_sample_llm_output(raw_response: str, output_format_str):
+#     """
+#     Parse LLM output with possibly multiple metadata lines + shared explanations.
+#     """
+#     lines = [line.strip() for line in raw_response.strip().splitlines() if line.strip()]
+#     metadata_list = []
+#     explanation_lines = []
+#     if output_format_str == "country_name, modern/ancient/unknown":
+#         parts = [x.strip() for x in lines[0].split(",")]
+#         if len(parts)==2:
+#           metadata_list.append({
+#               "country": parts[0],
+#               "sample_type": parts[1]#,
+#               #"ethnicity": parts[2],
+#               #"location": parts[3]
+#           })
+#         if 1<len(lines):
+#           line = lines[1]
+#           if "\n" in line:  line = line.split("\n")
+#           if ". " in line: line = line.split(". ")
+#           if isinstance(line,str): line = [line]
+#           explanation_lines += line
+#     elif output_format_str == "modern/ancient/unknown":
+#       metadata_list.append({
+#           "country": "unknown",
+#           "sample_type": lines[0]#,
+#           #"ethnicity": parts[2],
+#           #"location": parts[3]
+#       })
+#       explanation_lines.append(lines[1])
+
+#     # Assign explanations (optional) to each sample — same explanation reused
+#     for md in metadata_list:
+#         md["country_explanation"] = None
+#         md["sample_type_explanation"] = None
+
+#         if md["country"].lower() != "unknown" and len(explanation_lines) >= 1:
+#             md["country_explanation"] = explanation_lines[0]
+
+#         if md["sample_type"].lower() != "unknown":
+#             if len(explanation_lines) >= 2:
+#                 md["sample_type_explanation"] = explanation_lines[1]
+#             elif len(explanation_lines) == 1 and md["country"].lower() == "unknown":
+#                 md["sample_type_explanation"] = explanation_lines[0]
+#             elif len(explanation_lines) == 1:
+#                 md["sample_type_explanation"] = explanation_lines[0]
+#     return metadata_list
+
+def parse_multi_sample_llm_output(raw_response: str, output_format_str):
+    """
+    Parse LLM output with possibly multiple metadata lines + shared explanations.
+    """
+    metadata_list = {}
+    explanation_lines = []
+    output_answers = raw_response.split("\n")[0].split(", ")
+    explanation_lines =  [x for x in raw_response.split("\n")[1:] if x.strip()]
+    print("raw explanation line which split by new line: ", explanation_lines)
+    if len(explanation_lines) == 1:
+        if len(explanation_lines[0].split(". ")) > len(explanation_lines):
+          explanation_lines =  [x for x in explanation_lines[0].split(". ") if x.strip()]
+          print("explain line split by dot: ", explanation_lines)  
+    output_formats = output_format_str.split(", ")
+    explain = ""
+    # assign output format to its output answer and explanation
+    if output_format_str:
+      outputs = output_format_str.split(", ")
+      for o in range(len(outputs)):
+        output = outputs[o]
+        metadata_list[output] = {"answer":"",
+                                  output+"_explanation":""}   
+        # assign output answers
+        if o < len(output_answers):
+          # check if output_format unexpectedly in the answer such as:  
+          #country_name: Europe, modern/ancient: modern
+          try: 
+            if ": " in output_answers[o]:
+              output_answers[o] = output_answers[o].split(": ")[1]
+          except:
+            pass    
+          # Europe, modern  
+          metadata_list[output]["answer"] = output_answers[o]
+          if "unknown" in metadata_list[output]["answer"].lower():
+            metadata_list[output]["answer"] = "unknown"
+        else:
+          metadata_list[output]["answer"] = "unknown"
+        # assign explanations
+        if metadata_list[output]["answer"] != "unknown":
+          # if explanation_lines:
+          #   explain = explanation_lines.pop(0)
+          # else:
+          #   explain = ". ".join(explanation_lines)    
+          explain = ". ".join(explanation_lines)  
+          metadata_list[output][output+"_explanation"] = explain
+        else:  
+          metadata_list[output][output+"_explanation"] = "unknown"  
+    return metadata_list
+
+def merge_metadata_outputs(metadata_list):
+    """
+    Merge a list of metadata dicts into one, combining differing values with 'or'.
+    Assumes all dicts have the same keys.
+    """
+    if not metadata_list:
+        return {}
+
+    merged = {}
+    keys = metadata_list[0].keys()
+
+    for key in keys:
+        values = [md[key] for md in metadata_list if key in md]
+        unique_values = list(dict.fromkeys(values))  # preserve order, remove dupes
+        if "unknown" in unique_values:
+          unique_values.pop(unique_values.index("unknown"))
+        if len(unique_values) == 1:
+            merged[key] = unique_values[0]
+        else:
+            merged[key] = " or ".join(unique_values)
+
+    return merged
+
+
+def query_document_info(query_word, alternative_query_word, metadata, master_structured_lookup, faiss_index, document_chunks, llm_api_function, chunk=None, all_output=None, model_ai=None):
+    """
+    Queries the document using a hybrid approach:
+    1. Local structured lookup (fast, cheap, accurate for known patterns).
+    2. RAG with semantic search and LLM (general, flexible, cost-optimized).
+    """
+    print("inside the model.query_doc_info")
+    if model_ai:
+      if model_ai == "gemini-1.5-flash-latest":
+        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
+        PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
+        PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
+        PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
+        global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-1.5-flash-latest")#('gemini-1.5-flash-latest')
+    else:
+      genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))    
+      # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
+      PRICE_PER_1K_INPUT_LLM = 0.00010      # $0.10 per 1M input tokens
+      PRICE_PER_1K_OUTPUT_LLM = 0.00040     # $0.40 per 1M output tokens
+
+      # Embedding-001 pricing per 1,000 input tokens
+      PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens
+      global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-2.5-flash-lite")#('gemini-1.5-flash-latest')
+    
+    if metadata:
+      extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = metadata["country"], metadata["specific_location"], metadata["ethnicity"], metadata["sample_type"]
+      extracted_col_date, extracted_iso, extracted_title, extracted_features = metadata["collection_date"], metadata["isolate"], metadata["title"], metadata["all_features"]
+    else:
+      extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = "unknown", "unknown", "unknown", "unknown"
+      extracted_col_date, extracted_iso, extracted_title = "unknown", "unknown", "unknown"
+    # --- NEW: Pre-process alternative_query_word to remove '.X' suffix if present ---
+    if alternative_query_word:
+        alternative_query_word_cleaned = alternative_query_word.split('.')[0]
+    else:
+        alternative_query_word_cleaned = alternative_query_word
+    country_explanation, sample_type_explanation = None, None
+
+    # Use the consolidated final_structured_entries for direct lookup
+    # final_structured_entries = master_structured_lookup.get('final_structured_entries', {})
+    # document_title = master_structured_lookup.get('document_title', 'Unknown Document Title') # Retrieve document title
+
+    # Default values for all extracted fields. These will be updated.
+    method_used = 'unknown' # Will be updated based on the method that yields a result
+    population_code_from_sl = 'unknown' # To pass to RAG prompt if available
+    total_query_cost = 0
+    # Attempt 1: Try primary query_word (e.g., isolate name) with structured lookup
+    # try: 
+    #     print("try attempt 1 in model query")
+    #     structured_info = final_structured_entries.get(query_word.upper())
+    #     if structured_info:
+    #         if extracted_country == 'unknown':
+    #           extracted_country = structured_info['country']
+    #         if extracted_type == 'unknown':
+    #           extracted_type = structured_info['type']
+              
+    #         # if extracted_ethnicity == 'unknown':
+    #         #   extracted_ethnicity = structured_info.get('ethnicity', 'unknown') # Get ethnicity from structured lookup
+    #         # if extracted_specific_location == 'unknown':
+    #         #   extracted_specific_location = structured_info.get('specific_location', 'unknown') # Get specific_location from structured lookup
+    #         population_code_from_sl = structured_info['population_code']
+    #         method_used = "structured_lookup_direct"
+    #         print(f"'{query_word}' found in structured lookup (direct match).")
+    # except:
+    #     print("pass attempt 1 in model query")
+    #     pass 
+    # # Attempt 2: Try primary query_word with heuristic range lookup if direct fails (only if not already resolved)
+    # try:
+    #     print("try attempt 2 in model query")
+    #     if method_used == 'unknown':
+    #         query_prefix, query_num_str = _parse_individual_code_parts(query_word)
+    #         if query_prefix is not None and query_num_str is not None:
+    #             try: query_num = int(query_num_str)
+    #             except ValueError: query_num = None
+    #             if query_num is not None:
+    #                 query_prefix_upper = query_prefix.upper()
+    #                 contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
+    #                 pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
+    #                 pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
+    #                 pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
+    
+    #                 if query_prefix_upper in contiguous_ranges:
+    #                     for start_num, end_num, pop_code_for_range in contiguous_ranges[query_prefix_upper]:
+    #                         if start_num <= query_num <= end_num:
+    #                             country_from_heuristic = pop_code_to_country.get(pop_code_for_range, 'unknown')
+    #                             if country_from_heuristic != 'unknown':
+    #                                 if extracted_country == 'unknown':
+    #                                   extracted_country = country_from_heuristic
+    #                                 if extracted_type == 'unknown':
+    #                                   extracted_type = 'modern'
+    #                                 # if extracted_ethnicity == 'unknown':
+    #                                 #   extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
+    #                                 # if extracted_specific_location == 'unknown':
+    #                                 #   extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
+    #                                 population_code_from_sl = pop_code_for_range
+    #                                 method_used = "structured_lookup_heuristic_range_match"
+    #                                 print(f"'{query_word}' not direct. Heuristic: Falls within range {query_prefix_upper}{start_num}-{query_prefix_upper}{end_num}.")
+    #                                 break
+    #                             else:
+    #                                 print(f"'{query_word}' heuristic match found, but country unknown. Will fall to RAG below.")
+    # except:
+    #     print("pass attempt 2 in model query")                              
+    #     pass
+    # # Attempt 3: If primary query_word failed all structured lookups, try alternative_query_word (cleaned)
+    # try:
+    #     print("try attempt 3 in model query")
+    #     if method_used == 'unknown' and alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
+    #         print(f"'{query_word}' not found in structured (or heuristic). Trying alternative '{alternative_query_word_cleaned}'.")
+    
+    #         # Try direct lookup for alternative word
+    #         structured_info_alt = final_structured_entries.get(alternative_query_word_cleaned.upper())
+    #         if structured_info_alt:
+    #           if extracted_country == 'unknown':
+    #             extracted_country = structured_info_alt['country']
+    #           if extracted_type == 'unknown':
+    #             extracted_type = structured_info_alt['type']
+    #           # if extracted_ethnicity == 'unknown':
+    #           #   extracted_ethnicity = structured_info_alt.get('ethnicity', 'unknown')
+    #           # if extracted_specific_location == 'unknown':
+    #           #   extracted_specific_location = structured_info_alt.get('specific_location', 'unknown')
+    #           population_code_from_sl = structured_info_alt['population_code']
+    #           method_used = "structured_lookup_alt_direct"
+    #           print(f"Alternative '{alternative_query_word_cleaned}' found in structured lookup (direct match).")
+    #         else:
+    #             # Try heuristic lookup for alternative word
+    #             alt_prefix, alt_num_str = _parse_individual_code_parts(alternative_query_word_cleaned)
+    #             if alt_prefix is not None and alt_num_str is not None:
+    #                 try: alt_num = int(alt_num_str)
+    #                 except ValueError: alt_num = None
+    #                 if alt_num is not None:
+    #                     alt_prefix_upper = alt_prefix.upper()
+    #                     contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
+    #                     pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
+    #                     pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
+    #                     pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
+    #                     if alt_prefix_upper in contiguous_ranges:
+    #                         for start_num, end_num, pop_code_for_range in contiguous_ranges[alt_prefix_upper]:
+    #                             if start_num <= alt_num <= end_num:
+    #                                 country_from_heuristic_alt = pop_code_to_country.get(pop_code_for_range, 'unknown')
+    #                                 if country_from_heuristic_alt != 'unknown':
+    #                                   if extracted_country == 'unknown':
+    #                                     extracted_country = country_from_heuristic_alt
+    #                                   if extracted_type == 'unknown':
+    #                                     extracted_type = 'modern'
+    #                                   # if extracted_ethnicity == 'unknown':
+    #                                   #   extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
+    #                                   # if extracted_specific_location == 'unknown':
+    #                                   #   extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
+    #                                   population_code_from_sl = pop_code_for_range
+    #                                   method_used = "structured_lookup_alt_heuristic_range_match"
+    #                                   break
+    #                                 else:
+    #                                     print(f"Alternative '{alternative_query_word_cleaned}' heuristic match found, but country unknown. Will fall to RAG below.")
+    # except:
+    #     print("pass attempt 3 in model query")
+    #     pass
+    # use the context_for_llm to detect present_ancient before using llm model
+    # retrieved_chunks_text = []
+    # if document_chunks:
+    #   for idx in range(len(document_chunks)):
+    #           retrieved_chunks_text.append(document_chunks[idx])
+    # context_for_llm = ""
+    # all_context = "\n".join(retrieved_chunks_text) # 
+    # listOfcontexts = {"chunk": chunk, 
+    #         "all_output": all_output,
+    #         "document_chunk": all_context}
+    # label, context_for_llm = chooseContextLLM(listOfcontexts, query_word) 
+    # if not context_for_llm:
+    #   label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
+    #   if not context_for_llm:
+    #     context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
+    # if context_for_llm:
+    #   extracted_type, explain = mtdna_classifier.detect_ancient_flag(context_for_llm)
+    #   extracted_type = extracted_type.lower()
+    #   sample_type_explanation = explain
+    # 5. Execute RAG if needed (either full RAG or targeted RAG for missing fields)
+
+    # Determine if a RAG call is necessary
+    # run_rag = (extracted_country == 'unknown' or extracted_type == 'unknown')# or \
+    #            #extracted_ethnicity == 'unknown' or extracted_specific_location == 'unknown')
+    run_rag = True
+    if run_rag:
+        print("try run rag")
+        # Determine the phrase for LLM query
+        rag_query_phrase = f"'{query_word}'"
+        if alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
+            rag_query_phrase += f" or its alternative word '{alternative_query_word_cleaned}'"
+
+        # Construct a more specific semantic query phrase for embedding if structured info is available
+        semantic_query_for_embedding = rag_query_phrase # Default
+        # if extracted_country != 'unknown': # If country is known from structured lookup (for targeted RAG)
+        #     if population_code_from_sl != 'unknown':
+        #         semantic_query_for_embedding = f"ethnicity and specific location for {query_word} population {population_code_from_sl} in {extracted_country}"
+        #     else: # If pop_code not found in structured, still use country hint
+        #         semantic_query_for_embedding = f"ethnicity and specific location for {query_word} in {extracted_country}"
+        # print(f"  DEBUG: Semantic query for embedding: '{semantic_query_for_embedding}'")
+
+
+        # Determine fields to ask LLM for and output format based on what's known/needed
+        prompt_instruction_prefix = ""
+        output_format_str = ""
+
+        # Determine if it's a full RAG or targeted RAG scenario based on what's already extracted
+        is_full_rag_scenario = True#(extracted_country == 'unknown')
+
+        if is_full_rag_scenario: # Full RAG scenario
+            output_format_str = "country_name, modern/ancient/unknown"#, ethnicity, specific_location/unknown"
+            method_used = "rag_llm"
+            print(f"Proceeding to FULL RAG for {rag_query_phrase}.")
+        # else: # Targeted RAG scenario (country/type already known, need ethnicity/specific_location)
+        #     if extracted_type == "unknown":
+        #         prompt_instruction_prefix = (
+        #             f"I already know the country is {extracted_country}. "
+        #             f"{f'The population code is {population_code_from_sl}. ' if population_code_from_sl != 'unknown' else ''}"
+        #         )
+        #         #output_format_str = "modern/ancient/unknown, ethnicity, specific_location/unknown"
+        #         output_format_str = "modern/ancient/unknown"
+        #     # else:
+        #     #     prompt_instruction_prefix = (
+        #     #         f"I already know the country is {extracted_country} and the sample type is {extracted_type}. "
+        #     #         f"{f'The population code is {population_code_from_sl}. ' if population_code_from_sl != 'unknown' else ''}"
+        #     #     )
+        #     #     output_format_str = "ethnicity, specific_location/unknown"
+
+        #     method_used = "hybrid_sl_rag"
+        #     print(f"Proceeding to TARGETED RAG for {rag_query_phrase}.")
+
+
+        # Calculate embedding cost for the primary query word
+        current_embedding_cost = 0
+        # try:
+        #     query_embedding_vector = get_embedding(semantic_query_for_embedding, task_type="RETRIEVAL_QUERY")
+        #     query_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(semantic_query_for_embedding).total_tokens
+        #     current_embedding_cost += (query_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
+        #     print(f"  DEBUG: Query embedding tokens (for '{semantic_query_for_embedding}'): {query_embedding_tokens}, cost: ${current_embedding_cost:.6f}")
+
+        #     if alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
+        #         alt_embedding_vector = get_embedding(alternative_query_word_cleaned, task_type="RETRIEVAL_QUERY")
+        #         alt_embedding_tokens = global_llm_model_for_counting_tokens.count_tokens(alternative_query_word_cleaned).total_tokens
+        #         current_embedding_cost += (alt_embedding_tokens / 1000) * PRICE_PER_1K_EMBEDDING_INPUT
+        #         print(f"  DEBUG: Alternative query ('{alternative_query_word_cleaned}') embedding tokens: {alt_embedding_tokens}, cost: ${current_embedding_cost:.6f}")
+
+        # except Exception as e:
+        #     print(f"Error getting query embedding for RAG: {e}")
+        #     return extracted_country, extracted_type, "embedding_failed", extracted_ethnicity, extracted_specific_location, total_query_cost
+
+        # if query_embedding_vector is None or query_embedding_vector.shape[0] == 0:
+        #     return extracted_country, extracted_type, "embedding_failed", extracted_ethnicity, extracted_specific_location, total_query_cost
+
+        # D, I = faiss_index.search(np.array([query_embedding_vector]), 4)
+
+        # retrieved_chunks_text = []
+        # for idx in I[0]:
+        #     if 0 <= idx < len(document_chunks):
+        #         retrieved_chunks_text.append(document_chunks[idx])
+
+        # context_for_llm = ""
+        
+        # all_context = "\n".join(retrieved_chunks_text) # 
+        print("direct to llm")
+        listOfcontexts = {"chunk": chunk, 
+            "all_output": all_output,
+            "document_chunk": chunk}
+        label, context_for_llm = chooseContextLLM(listOfcontexts, query_word) 
+        if not context_for_llm:
+          label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
+          if not context_for_llm:
+            context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
+        #print("context for llm: ", label)    
+        # prompt_for_llm = (
+        #     f"{prompt_instruction_prefix}"
+        #     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in general if these specific identifiers are not explicitly found. "
+        #     f"Identify its primary associated country/geographic location. "
+        #     f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
+        #     f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
+        #     f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
+        #     f"Additionally, extract its ethnicity and a more specific location (city/district level) within the predicted country. "
+        #     f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
+        #     f"Provide only the country, sample type, ethnicity, and specific location, do not add extra explanations.\n\n"
+        #     f"Text Snippets:\n{context_for_llm}\n\n"
+        #     f"Output Format: {output_format_str}"
+        # )
+        if len(context_for_llm) > 1000*1000:
+          context_for_llm = context_for_llm[:900000]
+
+        # fix the prompt better:
+        # firstly clarify more by saying which type of organism, prioritize homo sapiens
+        features = metadata["all_features"]
+        organism = "general"
+        if features != "unknown":
+          if "organism" in features:
+            try:
+              organism = features.split("organism: ")[1].split("\n")[0]  
+            except:
+              organism = features.replace("\n","; ")  
+        explain_list = "country or sample type (modern/ancient)" #or ethnicity or specific location (province/city)"    
+        
+#         prompt_for_llm = (
+#     f"{prompt_instruction_prefix}"
+#     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in general if these specific identifiers are not explicitly found. "
+#     f"Identify its primary associated country/geographic location. "
+#     f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
+#     f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
+#     f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
+#     f"Provide only {output_format_str}. "
+#     f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
+#     f"If the country or sample type (modern/ancient) is not 'unknown', write 1 sentence after the output explaining how you inferred it from the text (one sentence for each)."
+#     f"\n\nText Snippets:\n{context_for_llm}\n\n"
+#     f"Output Format: {output_format_str}"
+# )
+        
+#         prompt_for_llm = (
+#     f"{prompt_instruction_prefix}"
+#     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} or the mitochondrial DNA sample in {organism} if these specific identifiers are not explicitly found. "
+#     f"Identify its primary associated country/geographic location. "
+#     f"Also, determine if the genetic sample or individual mentioned is from a 'modern' (present-day living individual) "
+#     f"or 'ancient' (e.g., prehistoric remains, archaeological sample) source. "
+#     f"If the text does not mention whether the sample is ancient or modern, assume the sample is modern unless otherwise explicitly described as ancient or archaeological. "
+#     f"Provide only {output_format_str}. "
+#     f"If any information is not explicitly present in the provided text snippets, state 'unknown' for that specific piece of information. "
+#     f"If the {explain_list} is not 'unknown', write 1 sentence after the output explaining how you inferred it from the text (one sentence for each)."
+#     f"\n\nText Snippets:\n{context_for_llm}\n\n"
+#     f"Output Format: {output_format_str}"
+# )
+#         prompt_for_llm = (
+#     f"{prompt_instruction_prefix}"
+#     f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
+#     f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
+#     f"Identify its **primary associated geographic location**, preferring the most specific available: "
+#     f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
+#     f"the next most specific region, continent, island, or other clear geographic area mentioned. "
+#     f"If no geographic clues at all are present, state 'unknown' for location. "
+#     f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
+#     f"or 'ancient' (prehistoric/archaeological) source. "
+#     f"If the text does not specify ancient or archaeological context, assume 'modern'. "
+#     f"Provide only {output_format_str}. "
+#     f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
+#     f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text (one sentence for each)."
+#     f"\n\nText Snippets:\n{context_for_llm}\n\n"
+#     f"Output Format: {output_format_str}"
+# )
+        prompt_for_llm = (
+    f"{prompt_instruction_prefix}"
+    f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
+    f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
+    f"Identify its **primary associated geographic location**, preferring the most specific available: "
+    f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
+    f"the next most specific region, continent, island, or other clear geographic area mentioned. "
+    f"If no geographic clues at all are present, state 'unknown' for location. "
+    f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
+    f"or 'ancient' (prehistoric/archaeological) source. "
+    f"If the text does not specify ancient or archaeological context, assume 'modern'. "
+    f"Provide only {output_format_str}. "
+    f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
+    f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text "
+    f"(one sentence for each). "
+    f"Format your answer so that:\n"
+    f"1. The **first line** contains only the {output_format_str} answer.\n"
+    f"2. The **second line onward** contains the explanations based on the order of the non-unknown {output_format_str} answer.\n"
+    f"\nText Snippets:\n{context_for_llm}\n\n"
+    f"Output Format Example:\nBrunei, modern.\n"
+    f"The text explicitly states BRU18 in the context of brunei (borneo), indicating the country and a broader geographic region within that country."
+    f"The study is published in a journal, implying research on living individuals, hence modern."
+)
+
+        if model_ai:
+          print("back up to ", model_ai)
+          llm_response_text, model_instance = call_llm_api(prompt_for_llm, model=model_ai)
+        else:
+          print("still 2.5 flash gemini")
+          llm_response_text, model_instance = call_llm_api(prompt_for_llm)
+        print("\n--- DEBUG INFO FOR RAG ---")
+        print("Retrieved Context Sent to LLM (first 500 chars):")
+        print(context_for_llm[:500] + "..." if len(context_for_llm) > 500 else context_for_llm)
+        print("\nRaw LLM Response:")
+        print(llm_response_text)
+        print("--- END DEBUG INFO ---")
+
+        llm_cost = 0
+        if model_instance:
+            try:
+                input_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(prompt_for_llm).total_tokens
+                output_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(llm_response_text).total_tokens
+                print(f"  DEBUG: LLM Input tokens: {input_llm_tokens}")
+                print(f"  DEBUG: LLM Output tokens: {output_llm_tokens}")
+                llm_cost = (input_llm_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
+                           (output_llm_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
+                print(f"  DEBUG: Estimated LLM cost: ${llm_cost:.6f}")
+            except Exception as e:
+                print(f"  DEBUG: Error counting LLM tokens: {e}")
+                llm_cost = 0
+
+        total_query_cost += current_embedding_cost + llm_cost
+        print(f"  DEBUG: Total estimated cost for this RAG query: ${total_query_cost:.6f}")
+        # Parse the LLM's response based on the Output Format actually used
+        # if output_format_str == "ethnicity, specific_location/unknown": # Targeted RAG output
+        #     extracted_ethnicity,extracted_specific_location = clean_llm_output(llm_response_text, output_format_str)
+        # elif output_format_str == "modern/ancient/unknown, ethnicity, specific_location/unknown":
+        #     extracted_type, extracted_ethnicity,extracted_specific_location=clean_llm_output(llm_response_text, output_format_str) 
+        # else: # Full RAG output (country, type, ethnicity, specific_location)
+        #     extracted_country,extracted_type, extracted_ethnicity,extracted_specific_location=clean_llm_output(llm_response_text, output_format_str) 
+        metadata_list = parse_multi_sample_llm_output(llm_response_text, output_format_str)
+        # merge_metadata = merge_metadata_outputs(metadata_list)
+        # if output_format_str == "country_name, modern/ancient/unknown":
+        #   extracted_country, extracted_type = merge_metadata["country"], merge_metadata["sample_type"]  
+        #   country_explanation,sample_type_explanation = merge_metadata["country_explanation"], merge_metadata["sample_type_explanation"]
+        # elif output_format_str == "modern/ancient/unknown":
+        #   extracted_type = merge_metadata["sample_type"]  
+        #   sample_type_explanation = merge_metadata["sample_type_explanation"]
+        # for the output_format that is not default
+        if output_format_str == "country_name, modern/ancient/unknown":
+          outputs = output_format_str.split(", ")
+          extracted_country, extracted_type = metadata_list[outputs[0]]["answer"], metadata_list[outputs[1]]["answer"]  
+          country_explanation,sample_type_explanation = metadata_list[outputs[0]][outputs[0]+"_explanation"], metadata_list[outputs[1]][outputs[1]+"_explanation"]
+          # extracted_ethnicity, extracted_specific_location = metadata_list[outputs[2]]["answer"], metadata_list[outputs[3]]["answer"]  
+          # ethnicity_explanation, specific_loc_explanation = metadata_list[outputs[2]][outputs[2]+"_explanation"], metadata_list[outputs[3]][outputs[3]+"_explanation"]
+    # 6. Optional: Second LLM call for specific_location from general knowledge if still unknown
+    # if extracted_specific_location == 'unknown':
+    #     # Check if we have enough info to ask general knowledge LLM
+    #     if extracted_country != 'unknown' and extracted_ethnicity != 'unknown':
+    #         print(f"  DEBUG: Specific location still unknown. Querying general knowledge LLM from '{extracted_ethnicity}' and '{extracted_country}'.")
+
+    #         general_knowledge_prompt = (
+    #             f"Based on general knowledge, what is a highly specific location (city or district) "
+    #             f"associated with the ethnicity '{extracted_ethnicity}' in '{extracted_country}'? "
+    #             f"Consider the context of scientific studies on human genetics, if known. "
+    #             f"If no common specific location is known, state 'unknown'. "
+    #             f"Provide only the city or district name, or 'unknown'."
+    #         )
+
+    #         general_llm_response, general_llm_model_instance = call_llm_api(general_knowledge_prompt, model_name='gemini-1.5-flash-latest')
+
+    #         if general_llm_response and general_llm_response.lower().strip() != 'unknown':
+    #             extracted_specific_location = general_llm_response.strip() + " (predicted from general knowledge)"
+    #             # Add cost of this second LLM call
+    #             if general_llm_model_instance:
+    #                 try:
+    #                     gk_input_tokens = general_llm_model_instance.count_tokens(general_knowledge_prompt).total_tokens
+    #                     gk_output_tokens = general_llm_model_instance.count_tokens(general_llm_response).total_tokens
+    #                     gk_cost = (gk_input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
+    #                               (gk_output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
+    #                     print(f"  DEBUG: General Knowledge LLM cost to predict specific location alone: ${gk_cost:.6f}")
+    #                     total_query_cost += gk_cost # Accumulate cost
+    #                 except Exception as e:
+    #                     print(f"  DEBUG: Error counting GK LLM tokens: {e}")
+    #         else:
+    #             print("  DEBUG: General knowledge LLM returned unknown or empty for specific location.")
+    # # 6. Optional: Second LLM call for ethnicity from general knowledge if still unknown
+    # if extracted_ethnicity == 'unknown':
+    #     # Check if we have enough info to ask general knowledge LLM
+    #     if extracted_country != 'unknown' and extracted_specific_location != 'unknown':
+    #         print(f"  DEBUG: Ethnicity still unknown. Querying general knowledge LLM from '{extracted_specific_location}' and '{extracted_country}'.")
+
+    #         general_knowledge_prompt = (
+    #             f"Based on general knowledge, what is a highly ethnicity (population) "
+    #             f"associated with the specific location '{extracted_specific_location}' in '{extracted_country}'? "
+    #             f"Consider the context of scientific studies on human genetics, if known. "
+    #             f"If no common ethnicity is known, state 'unknown'. "
+    #             f"Provide only the ethnicity or popluation name, or 'unknown'."
+    #         )
+
+    #         general_llm_response, general_llm_model_instance = call_llm_api(general_knowledge_prompt, model_name='gemini-1.5-flash-latest')
+
+    #         if general_llm_response and general_llm_response.lower().strip() != 'unknown':
+    #             extracted_ethnicity = general_llm_response.strip() + " (predicted from general knowledge)"
+    #             # Add cost of this second LLM call
+    #             if general_llm_model_instance:
+    #                 try:
+    #                     gk_input_tokens = general_llm_model_instance.count_tokens(general_knowledge_prompt).total_tokens
+    #                     gk_output_tokens = general_llm_model_instance.count_tokens(general_llm_response).total_tokens
+    #                     gk_cost = (gk_input_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
+    #                               (gk_output_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
+    #                     print(f"  DEBUG: General Knowledge LLM cost to predict ethnicity alone: ${gk_cost:.6f}")
+    #                     total_query_cost += gk_cost # Accumulate cost
+    #                 except Exception as e:
+    #                     print(f"  DEBUG: Error counting GK LLM tokens: {e}")
+    #         else:
+    #             print("  DEBUG: General knowledge LLM returned unknown or empty for ethnicity.")            
+
+
+    #return extracted_country, extracted_type, method_used, extracted_ethnicity, extracted_specific_location, total_query_cost
+    print(str([extracted_country, extracted_type]))
+    fields = [
+    ("country_name", extracted_country),
+    ("modern/ancient/unknown", extracted_type),
+    # ("specific_location (province/city)", extracted_specific_location),
+    # ("ethnicity", extracted_ethnicity),
+    ]
+
+    for field_name, value in fields:
+      print("this is value:", value.lower())
+      if value.lower() in " ".join(["unknown", "unspecified","could not get response from llm api.", "undefined"]):
+        print("have to do again")
+        output_format_str = field_name
+        print("output format:", output_format_str)
+        general_knowledge_prompt = (
+    f"{prompt_instruction_prefix}"
+    f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
+    f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
+    f"Identify its **primary associated geographic location**, preferring the most specific available: "
+    f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
+    f"the next most specific region, continent, island, or other clear geographic area mentioned. "
+    f"If no geographic clues at all are present, state 'unknown' for location. "
+    f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
+    f"or 'ancient' (prehistoric/archaeological) source. "
+    f"If the text does not specify ancient or archaeological context, assume 'modern'. "
+    f"Provide only {output_format_str}. "
+    f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
+    f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text "
+    f"(one sentence for each). "
+    f"Format your answer so that:\n"
+    f"1. The **first line** contains only the {output_format_str} answer.\n"
+    f"2. The **second line onward** contains the explanations based on the order of the non-unknown {output_format_str} answer.\n"
+    f"\nText Snippets:\n{context_for_llm}"#\n\n"
+    # f"Output Format Example:\nBrunei, modern, unknown, Borneo.\n"
+    # f"The text explicitly states BRU18 in the context of brunei (borneo), indicating the country and a broader geographic region within that country."
+    # f"The study is published in a journal, implying research on living individuals, hence modern."
+    # f"The text mentions 183 from sabah brunei and kalimantan in borneo in the context of analyzing mtDNA sequences from island Southeast Asia."
+)
+        # general_knowledge_prompt = (
+        #     f"Based on general knowledge, what is a likely {output_format_str} "
+        #     f"associated with the following text snippet: {context_llm_text}\n\n? "
+        #     f"Consider the context of scientific studies on human genetics, if known. "
+        #     f"If no common {output_format_str} is known, state 'unknown'. "
+        #     f"Format your answer so that:\n"
+        #     f"1. The **first line** contains only the {output_format_str} answer.\n"
+        #     f"2. The **second line onward** contains the explanations based on the order of the non-unknown {output_format_str} answer.\n"
+        # )
+        print("len of prompt:", len(general_knowledge_prompt))
+
+        if model_ai:
+          print("back up to ", model_ai)
+          llm_response_text, model_instance = call_llm_api(general_knowledge_prompt, model=model_ai)
+        else:
+          print("still 2.5 flash gemini")
+          llm_response_text, model_instance = call_llm_api(general_knowledge_prompt)  
+        print("\n--- DEBUG INFO FOR RAG ---")
+        print("Retrieved Context Sent to LLM (first 500 chars):")
+        print(context_for_llm[:500] + "..." if len(context_for_llm) > 500 else context_for_llm)
+        print("\nRaw LLM Response:")
+        print(llm_response_text)
+        print("--- END DEBUG INFO ---")
+
+        llm_cost = 0
+        if model_instance:
+            try:
+                input_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(prompt_for_llm).total_tokens
+                output_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(llm_response_text).total_tokens
+                print(f"  DEBUG: LLM Input tokens: {input_llm_tokens}")
+                print(f"  DEBUG: LLM Output tokens: {output_llm_tokens}")
+                llm_cost = (input_llm_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
+                           (output_llm_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
+                print(f"  DEBUG: Estimated LLM cost: ${llm_cost:.6f}")
+            except Exception as e:
+                print(f"  DEBUG: Error counting LLM tokens: {e}")
+                llm_cost = 0
+
+        total_query_cost += current_embedding_cost + llm_cost
+        print("total query cost in again: ", total_query_cost)
+        metadata_list = parse_multi_sample_llm_output(llm_response_text, output_format_str)
+
+        if output_format_str == "country_name":
+          outputs = output_format_str.split(", ")
+          extracted_country= metadata_list[outputs[0]]["answer"]  
+          country_explanation = metadata_list[outputs[0]][outputs[0]+"_explanation"]
+        elif output_format_str == "modern/ancient/unknown":
+          outputs = output_format_str.split(", ")
+          extracted_type= metadata_list[outputs[0]]["answer"]  
+          sample_type_explanation = metadata_list[outputs[0]][outputs[0]+"_explanation"] 
+        # elif output_format_str == "specific_location (province/city)":
+        #   outputs = output_format_str.split(", ")
+        #   extracted_specific_location= metadata_list[outputs[0]]["answer"]  
+        #   specific_loc_explanation = metadata_list[outputs[0]][outputs[0]+"_explanation"]
+        # elif output_format_str == "ethnicity":
+        #   outputs = output_format_str.split(", ")
+        #   extracted_ethnicity= metadata_list[outputs[0]]["answer"]  
+        #   ethnicity_explanation = metadata_list[outputs[0]][outputs[0]+"_explanation"] 
+        print("done for again")  
+    print("total cost: ", total_query_cost)        
+    return extracted_country, extracted_type, method_used, country_explanation, sample_type_explanation, total_query_cost