diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -25,25 +25,6 @@
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
 import webbrowser
 import zipfile
-# Local Module Imports (Libraries specific to this project)
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'App_Function_Libraries')))
-from App_Function_Libraries import *
-from App_Function_Libraries.Web_UI_Lib import *
-from App_Function_Libraries.Article_Extractor_Lib import *
-from App_Function_Libraries.Article_Summarization_Lib import *
-from App_Function_Libraries.Audio_Transcription_Lib import *
-from App_Function_Libraries.Chunk_Lib import *
-from App_Function_Libraries.Diarization_Lib import *
-from App_Function_Libraries.Local_File_Processing_Lib import *
-from App_Function_Libraries.Local_LLM_Inference_Engine_Lib import *
-from App_Function_Libraries.Local_Summarization_Lib import *
-from App_Function_Libraries.Summarization_General_Lib import *
-from App_Function_Libraries.System_Checks_Lib import *
-from App_Function_Libraries.Tokenization_Methods_Lib import *
-from App_Function_Libraries.Video_DL_Ingestion_Lib import *
-#from App_Function_Libraries.Web_UI_Lib import *
-
-
 # 3rd-Party Module Imports
 from bs4 import BeautifulSoup
 import gradio as gr
@@ -334,11 +315,348 @@ def print_hello():
 #######################################################################################################################
 # Online Article Extraction / Handling
 #
+# Article_Extractor_Lib.py
+#########################################
+# Article Extraction Library
+# This library is used to handle scraping and extraction of articles from web pages.
+# Currently uses a combination of beautifulsoup4 and trafilatura to extract article text.
+# Firecrawl would be a better option for this, but it is not yet implemented.
+####
+
+####################
 # Function List
+#
 # 1. get_page_title(url)
 # 2. get_article_text(url)
 # 3. get_article_title(article_url_arg)
 #
+####################
+
+
+
+# Import necessary libraries
+import os
+import logging
+import huggingface_hub
+import tokenizers
+import torchvision
+import transformers
+# 3rd-Party Imports
+import asyncio
+from playwright.async_api import async_playwright
+from bs4 import BeautifulSoup
+import requests
+import trafilatura
+# Import Local
+import summarize
+
+
+def get_page_title(url: str) -> str:
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        title_tag = soup.find('title')
+        return title_tag.string.strip() if title_tag else "Untitled"
+    except requests.RequestException as e:
+        logging.error(f"Error fetching page title: {e}")
+        return "Untitled"
+
+
+def get_article_title(article_url_arg: str) -> str:
+    # Use BeautifulSoup to get the page title - really should be using yt-dlp for this...
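+    # A yt-dlp sketch of the same lookup (hypothetical swap-in, not wired up here;
+    # yt_dlp is already used elsewhere in this app with these same options):
+    #   with yt_dlp.YoutubeDL({'quiet': True, 'skip_download': True}) as ydl:
+    #       return ydl.extract_info(article_url_arg, download=False).get('title', 'Untitled')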
+    article_title = get_page_title(article_url_arg)
+    return article_title
+
+
+def scrape_article(url):
+    async def fetch_html(url: str) -> str:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            context = await browser.new_context(
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+            page = await context.new_page()
+            await page.goto(url)
+            await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
+            content = await page.content()
+            await browser.close()
+            return content
+
+    def extract_article_data(html: str) -> dict:
+        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
+        if downloaded:
+            metadata = trafilatura.extract_metadata(html)
+            if metadata:
+                return {
+                    'title': metadata.title if metadata.title else 'N/A',
+                    'author': metadata.author if metadata.author else 'N/A',
+                    'content': downloaded,
+                    'date': metadata.date if metadata.date else 'N/A',
+                }
+            else:
+                print("Metadata extraction failed.")
+                return None
+        else:
+            print("Content extraction failed.")
+            return None
+
+    def convert_html_to_markdown(html: str) -> str:
+        soup = BeautifulSoup(html, 'html.parser')
+        # Convert each paragraph to markdown
+        for para in soup.find_all('p'):
+            para.append('\n')  # Add a newline at the end of each paragraph for markdown separation
+
+        # Use .get_text() with separator to keep paragraph separation
+        text = soup.get_text(separator='\n\n')
+
+        return text
+
+    async def fetch_and_extract_article(url: str):
+        html = await fetch_html(url)
+        print("HTML Content:", html[:500])  # Print first 500 characters of the HTML for inspection
+        article_data = extract_article_data(html)
+        if article_data:
+            article_data['content'] = convert_html_to_markdown(article_data['content'])
+            return article_data
+        else:
+            return None
+
+    # Using asyncio.run to handle event loop creation and execution
+    article_data = asyncio.run(fetch_and_extract_article(url))
+    return article_data
+
+#
+#
+#######################################################################################################################
+#
+#
+# Article_Summarization_Lib.py
+
+
+# Import necessary libraries
+from datetime import datetime
+import json
+import os
+import logging
+# 3rd-Party Imports
+import bs4
+import huggingface_hub
+import tokenizers
+import torchvision
+import transformers
+# Local Imports
+import summarize
+
+
+def ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, custom_prompt):
+    try:
+        # Check if content is not empty or whitespace
+        if not content.strip():
+            raise ValueError("Content is empty.")
+
+        db = Database()
+        create_tables()
+        keyword_list = keywords.split(',') if keywords else ["default"]
+        keyword_str = ', '.join(keyword_list)
+
+        # Set default values for missing fields
+        url = url or 'Unknown'
+        title = title or 'Unknown'
+        author = author or 'Unknown'
+        keywords = keywords or 'default'
+        summary = summary or 'No summary available'
+        ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')
+
+        # Log the values of all fields before calling add_media_with_keywords
+        logging.debug(f"URL: {url}")
+        logging.debug(f"Title: {title}")
+        logging.debug(f"Author: {author}")
+        logging.debug(f"Content: {content[:50]}...
(length: {len(content)})") # Log first 50 characters of content + logging.debug(f"Keywords: {keywords}") + logging.debug(f"Summary: {summary}") + logging.debug(f"Ingestion Date: {ingestion_date}") + logging.debug(f"Custom Prompt: {custom_prompt}") + + # Check if any required field is empty and log the specific missing field + if not url: + logging.error("URL is missing.") + raise ValueError("URL is missing.") + if not title: + logging.error("Title is missing.") + raise ValueError("Title is missing.") + if not content: + logging.error("Content is missing.") + raise ValueError("Content is missing.") + if not keywords: + logging.error("Keywords are missing.") + raise ValueError("Keywords are missing.") + if not summary: + logging.error("Summary is missing.") + raise ValueError("Summary is missing.") + if not ingestion_date: + logging.error("Ingestion date is missing.") + raise ValueError("Ingestion date is missing.") + if not custom_prompt: + logging.error("Custom prompt is missing.") + raise ValueError("Custom prompt is missing.") + + # Add media with keywords to the database + result = add_media_with_keywords( + url=url, + title=title, + media_type='article', + content=content, + keywords=keyword_str or "article_default", + prompt=custom_prompt or None, + summary=summary or "No summary generated", + transcription_model=None, # or some default value if applicable + author=author or 'Unknown', + ingestion_date=ingestion_date + ) + return result + except Exception as e: + logging.error(f"Failed to ingest article to the database: {e}") + return str(e) + + +def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title): + # Step 1: Scrape the article + article_data = scrape_article(url) + print(f"Scraped Article Data: {article_data}") # Debugging statement + if not article_data: + return "Failed to scrape the article." + + # Use the custom title if provided, otherwise use the scraped title + title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled') + author = article_data.get('author', 'Unknown') + content = article_data.get('content', '') + ingestion_date = datetime.now().strftime('%Y-%m-%d') + + print(f"Title: {title}, Author: {author}, Content Length: {len(content)}") # Debugging statement + + # Custom prompt for the article + article_custom_prompt = custom_prompt_arg or "Summarize this article." 
+ + # Step 2: Summarize the article + summary = None + if api_name: + logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}") + + # Sanitize filename for saving the JSON file + sanitized_title = sanitize_filename(title) + json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json") + + with open(json_file_path, 'w') as json_file: + json.dump([{'text': content}], json_file, indent=2) + + try: + if api_name.lower() == 'openai': + openai_api_key = api_key if api_key else config.get('API', 'openai_api_key', fallback=None) + logging.debug(f"Article_Summarizer: trying to summarize with openAI") + summary = summarize_with_openai(openai_api_key, json_file_path, article_custom_prompt) + elif api_name.lower() == "anthropic": + anthropic_api_key = api_key if api_key else config.get('API', 'anthropic_api_key', fallback=None) + logging.debug(f"Article_Summarizer: Trying to summarize with anthropic") + summary = summarize_with_claude(anthropic_api_key, json_file_path, anthropic_model, + custom_prompt_arg=article_custom_prompt) + elif api_name.lower() == "cohere": + cohere_api_key = api_key if api_key else config.get('API', 'cohere_api_key', fallback=None) + logging.debug(f"Article_Summarizer: Trying to summarize with cohere") + summary = summarize_with_cohere(cohere_api_key, json_file_path, cohere_model, + custom_prompt_arg=article_custom_prompt) + elif api_name.lower() == "groq": + groq_api_key = api_key if api_key else config.get('API', 'groq_api_key', fallback=None) + logging.debug(f"Article_Summarizer: Trying to summarize with Groq") + summary = summarize_with_groq(groq_api_key, json_file_path, groq_model, + custom_prompt_arg=article_custom_prompt) + elif api_name.lower() == "llama": + llama_token = api_key if api_key else config.get('API', 'llama_api_key', fallback=None) + llama_ip = llama_api_IP + logging.debug(f"Article_Summarizer: Trying to summarize with Llama.cpp") + summary = summarize_with_llama(llama_ip, json_file_path, llama_token, article_custom_prompt) + elif api_name.lower() == "kobold": + kobold_token = api_key if api_key else config.get('API', 'kobold_api_key', fallback=None) + kobold_ip = kobold_api_IP + logging.debug(f"Article_Summarizer: Trying to summarize with kobold.cpp") + summary = summarize_with_kobold(kobold_ip, json_file_path, kobold_token, article_custom_prompt) + elif api_name.lower() == "ooba": + ooba_token = api_key if api_key else config.get('API', 'ooba_api_key', fallback=None) + ooba_ip = ooba_api_IP + logging.debug(f"Article_Summarizer: Trying to summarize with oobabooga") + summary = summarize_with_oobabooga(ooba_ip, json_file_path, ooba_token, article_custom_prompt) + elif api_name.lower() == "tabbyapi": + tabbyapi_key = api_key if api_key else config.get('API', 'tabby_api_key', fallback=None) + tabbyapi_ip = tabby_api_IP + logging.debug(f"Article_Summarizer: Trying to summarize with tabbyapi") + tabby_model = summarize.llm_model + summary = summarize_with_tabbyapi(tabbyapi_key, tabbyapi_ip, json_file_path, tabby_model, + article_custom_prompt) + elif api_name.lower() == "vllm": + logging.debug(f"Article_Summarizer: Trying to summarize with VLLM") + summary = summarize_with_vllm(vllm_api_url, vllm_api_key, summarize.llm_model, json_file_path, + article_custom_prompt) + elif api_name.lower() == "huggingface": + huggingface_api_key = api_key if api_key else config.get('API', 'huggingface_api_key', fallback=None) + logging.debug(f"Article_Summarizer: Trying to summarize with huggingface") + summary = 
summarize_with_huggingface(huggingface_api_key, json_file_path, article_custom_prompt) + elif api_name.lower() == "openrouter": + openrouter_api_key = api_key if api_key else config.get('API', 'openrouter_api_key', fallback=None) + logging.debug(f"Article_Summarizer: Trying to summarize with openrouter") + summary = summarize_with_openrouter(openrouter_api_key, json_file_path, article_custom_prompt) + except requests.exceptions.ConnectionError as e: + logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}") + + if summary: + logging.info(f"Article_Summarizer: Summary generated using {api_name} API") + save_summary_to_file(summary, json_file_path) + else: + summary = "Summary not available" + logging.warning(f"Failed to generate summary using {api_name} API") + + else: + summary = "Article Summarization: No API provided for summarization." + + print(f"Summary: {summary}") # Debugging statement + + # Step 3: Ingest the article into the database + ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, + article_custom_prompt) + + return f"Title: {title}\nAuthor: {author}\nSummary: {summary}\nIngestion Result: {ingestion_result}" + + +def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title): + title = custom_article_title.strip() if custom_article_title else "Unstructured Text" + author = "Unknown" + ingestion_date = datetime.now().strftime('%Y-%m-%d') + + # Summarize the unstructured text + if api_name: + json_file_path = f"Results/{title.replace(' ', '_')}_segments.json" + with open(json_file_path, 'w') as json_file: + json.dump([{'text': text}], json_file, indent=2) + + if api_name.lower() == 'openai': + summary = summarize_with_openai(api_key, json_file_path, custom_prompt) + # Add other APIs as needed + else: + summary = "Unsupported API." + else: + summary = "No API provided for summarization." + + # Ingest the unstructured text into the database + ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date, + custom_prompt) + return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}" + + + +# +# +####################################################################################################################### +# # ####################################################################################################################### @@ -369,6 +687,270 @@ def print_hello(): # Function List # 1. convert_to_wav(video_file_path, offset=0, overwrite=False) # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False) + + +# Audio_Transcription_Lib.py +######################################### +# Transcription Library +# This library is used to perform transcription of audio files. +# Currently, uses faster_whisper for transcription. +# +#### +import configparser +#################### +# Function List +# +# 1. convert_to_wav(video_file_path, offset=0, overwrite=False) +# 2. 
speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
+#
+####################
+
+
+# Import necessary libraries to run solo for testing
+import json
+import logging
+import os
+import sys
+import subprocess
+import time
+# Import Local
+
+#######################################################################################################################
+# Function Definitions
+#
+
+# Convert video .m4a into .wav using ffmpeg
+# ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
+# https://www.gyan.dev/ffmpeg/builds/
+#
+
+
+# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
+def convert_to_wav(video_file_path, offset=0, overwrite=False):
+    out_path = os.path.splitext(video_file_path)[0] + ".wav"
+
+    if os.path.exists(out_path) and not overwrite:
+        print(f"File '{out_path}' already exists. Skipping conversion.")
+        logging.info(f"Skipping conversion as file already exists: {out_path}")
+        return out_path
+    print("Starting conversion process of .m4a to .WAV")
+
+    try:
+        if os.name == "nt":
+            logging.debug("ffmpeg being run on Windows")
+
+            if sys.platform.startswith('win'):
+                ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
+                logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
+            else:
+                ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
+
+            command = [
+                ffmpeg_cmd,  # Assuming the working directory is correctly set where .\Bin exists
+                "-ss", "00:00:00",  # Start at the beginning of the video
+                "-i", video_file_path,
+                "-ar", "16000",  # Audio sample rate
+                "-ac", "1",  # Number of audio channels
+                "-c:a", "pcm_s16le",  # Audio codec
+                out_path
+            ]
+            try:
+                # Redirect stdin from the null device to prevent ffmpeg from waiting for input
+                with open(os.devnull, 'rb') as null_file:
+                    result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
+                if result.returncode == 0:
+                    logging.info("FFmpeg executed successfully")
+                    logging.debug("FFmpeg output: %s", result.stdout)
+                else:
+                    logging.error("Error in running FFmpeg")
+                    logging.error("FFmpeg stderr: %s", result.stderr)
+                    raise RuntimeError(f"FFmpeg error: {result.stderr}")
+            except Exception as e:
+                logging.error("Error occurred while running ffmpeg on Windows: %s", str(e))
+                raise RuntimeError("ffmpeg failed")
+        elif os.name == "posix":
+            os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
+        else:
+            raise RuntimeError("Unsupported operating system")
+        logging.info("Conversion to WAV completed: %s", out_path)
+    except subprocess.CalledProcessError as e:
+        logging.error("Error executing FFmpeg command: %s", str(e))
+        raise RuntimeError("Error converting video file to WAV")
+    except Exception as e:
+        logging.error("convert_to_wav: Error converting file: %s", str(e))
+        return {"error": str(e)}
+    return out_path
+
+
+# Transcribe .wav into .segments.json
+def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False):
+    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
+    from faster_whisper import WhisperModel
+    # Retrieve the processing choice from the configuration file
+    config = configparser.ConfigParser()
+    config.read('config.txt')
+    processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
+    model = WhisperModel(whisper_model, device=f"{processing_choice}")
+    time_start = time.time()
+    if audio_file_path is None:
+        raise ValueError("speech-to-text: No audio file provided")
+    logging.info("speech-to-text: Audio file path: %s", audio_file_path)
+
+    try:
+        _, file_ending = os.path.splitext(audio_file_path)
+        out_file = audio_file_path.replace(file_ending, ".segments.json")
+        prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
+        if os.path.exists(out_file):
+            logging.info("speech-to-text: Segments file already exists: %s", out_file)
+            with open(out_file) as f:
+                global segments
+                segments = json.load(f)
+            return segments
+
+        logging.info('speech-to-text: Starting transcription...')
+        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
+        transcribe_options = dict(task="transcribe", **options)
+        segments_raw, info = model.transcribe(audio_file_path, **transcribe_options)
+
+        segments = []
+        for segment_chunk in segments_raw:
+            chunk = {
+                "Time_Start": segment_chunk.start,
+                "Time_End": segment_chunk.end,
+                "Text": segment_chunk.text
+            }
+            logging.debug("Segment: %s", chunk)
+            segments.append(chunk)
+        logging.info("speech-to-text: Transcription completed with faster_whisper")
+
+        # Save prettified JSON
+        with open(prettified_out_file, 'w') as f:
+            json.dump(segments, f, indent=2)
+
+        # Save non-prettified JSON
+        with open(out_file, 'w') as f:
+            json.dump(segments, f)
+
+    except Exception as e:
+        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
+        raise RuntimeError("speech-to-text: Error transcribing audio") from e
+    return segments
+
+
+
+#
+#
+#######################################################################################################################
+# Chunk Lib
+#
+#
+
+# from transformers import GPT2Tokenizer
+# import nltk
+# import re
+
+
+#
+# # FIXME - Make sure it only downloads if it doesn't already exist, and does a check first.
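+# # A guarded variant of the prep function below (sketch; nltk.data.find raises
+# # LookupError when 'punkt' is missing, so the download runs at most once):
+# def nltk_prep():
+#     try:
+#         nltk.data.find('tokenizers/punkt')
+#     except LookupError:
+#         nltk.download('punkt')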
+# # Ensure NLTK data is downloaded +# def ntlk_prep(): +# nltk.download('punkt') +# +# # Load GPT2 tokenizer +# tokenizer = GPT2Tokenizer.from_pretrained("gpt2") +# +# +# def load_document(file_path): +# with open(file_path, 'r') as file: +# text = file.read() +# return re.sub('\s+', ' ', text).strip() +# +# +# # Chunk based on maximum number of words, using ' ' (space) as a delimiter +# def chunk_text_by_words(text, max_words=300): +# words = text.split() +# chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)] +# return chunks +# +# +# # Chunk based on sentences, not exceeding a max amount, using nltk +# def chunk_text_by_sentences(text, max_sentences=10): +# sentences = nltk.tokenize.sent_tokenize(text) +# chunks = [' '.join(sentences[i:i + max_sentences]) for i in range(0, len(sentences), max_sentences)] +# return chunks +# +# +# # Chunk text by paragraph, marking paragraphs by (delimiter) '\n\n' +# def chunk_text_by_paragraphs(text, max_paragraphs=5): +# paragraphs = text.split('\n\n') +# chunks = ['\n\n'.join(paragraphs[i:i + max_paragraphs]) for i in range(0, len(paragraphs), max_paragraphs)] +# return chunks +# +# +# # Naive chunking based on token count +# def chunk_text_by_tokens(text, max_tokens=1000): +# tokens = tokenizer.encode(text) +# chunks = [tokenizer.decode(tokens[i:i + max_tokens]) for i in range(0, len(tokens), max_tokens)] +# return chunks +# +# +# # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number +# def chunk_text_hybrid(text, max_tokens=1000): +# sentences = nltk.tokenize.sent_tokenize(text) +# chunks = [] +# current_chunk = [] +# current_length = 0 +# +# for sentence in sentences: +# tokens = tokenizer.encode(sentence) +# if current_length + len(tokens) <= max_tokens: +# current_chunk.append(sentence) +# current_length += len(tokens) +# else: +# chunks.append(' '.join(current_chunk)) +# current_chunk = [sentence] +# current_length = len(tokens) +# +# if current_chunk: +# chunks.append(' '.join(current_chunk)) +# +# return chunks + + +# Sample text for testing +sample_text = """ +Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence +concerned with the interactions between computers and human language, in particular how to program computers +to process and analyze large amounts of natural language data. The result is a computer capable of "understanding" +the contents of documents, including the contextual nuances of the language within them. The technology can then +accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. + +Challenges in natural language processing frequently involve speech recognition, natural language understanding, +and natural language generation. + +Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled +"Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence. 
+""" + +# Example usage of different chunking methods +# print("Chunking by words:") +# print(chunk_text_by_words(sample_text, max_words=50)) +# +# print("\nChunking by sentences:") +# print(chunk_text_by_sentences(sample_text, max_sentences=2)) +# +# print("\nChunking by paragraphs:") +# print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1)) +# +# print("\nChunking by tokens:") +# print(chunk_text_by_tokens(sample_text, max_tokens=50)) +# +# print("\nHybrid chunking:") +# print(chunk_text_hybrid(sample_text, max_tokens=50)) + + + # # ####################################################################################################################### @@ -379,6 +961,2519 @@ def print_hello(): # # Function List 1. speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embedding", # embedding_size=512, num_speakers=0) + +# Local_File_Processing_Lib.py +######################################### +# Local File Processing and File Path Handling Library +# This library is used to handle processing local filepaths and URLs. +# It checks for the OS, the availability of the GPU, and the availability of the ffmpeg executable. +# If the GPU is available, it asks the user if they would like to use it for processing. +# If ffmpeg is not found, it asks the user if they would like to download it. +# The script will exit if the user chooses not to download ffmpeg. +#### + +#################### +# Function List +# +# 1. read_paths_from_file(file_path) +# 2. process_path(path) +# 3. process_local_file(file_path) +# 4. read_paths_from_file(file_path: str) -> List[str] +# +#################### + +# Import necessary libraries +import os +import logging + + +# Local_LLM_Inference_Engine_Lib.py +######################################### +# Local LLM Inference Engine Library +# This library is used to handle downloading, configuring, and launching the Local LLM Inference Engine +# via (llama.cpp via llamafile) +# +# +#### +import atexit +import hashlib +#################### +# Function List +# +# 1. download_latest_llamafile(repo, asset_name_prefix, output_filename) +# 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5) +# 3. verify_checksum(file_path, expected_checksum) +# 4. cleanup_process() +# 5. signal_handler(sig, frame) +# 6. local_llm_function() +# 7. launch_in_new_terminal_windows(executable, args) +# 8. launch_in_new_terminal_linux(executable, args) +# 9. launch_in_new_terminal_mac(executable, args) +# +#################### + +# Import necessary libraries +import json +import logging +from multiprocessing import Process as MpProcess +import requests +import sys +import os +# Import 3rd-pary Libraries +import gradio as gr +from tqdm import tqdm + + + +# Local_Summarization_Lib.py +######################################### +# Local Summarization Library +# This library is used to perform summarization with a 'local' inference engine. +# +#### + +#################### +# Function List +# +# 1. summarize_with_local_llm(file_path, custom_prompt_arg) +# 2. summarize_with_llama(api_url, file_path, token, custom_prompt) +# 3. summarize_with_kobold(api_url, file_path, kobold_api_token, custom_prompt) +# 4. summarize_with_oobabooga(api_url, file_path, ooba_api_token, custom_prompt) +# 5. summarize_with_vllm(vllm_api_url, vllm_api_key_function_arg, llm_model, text, vllm_custom_prompt_function_arg) +# 6. summarize_with_tabbyapi(tabby_api_key, tabby_api_IP, text, tabby_model, custom_prompt) +# 7. 
save_summary_to_file(summary, file_path) +# +# +#################### + + +# Import necessary libraries +import os +import logging +from typing import Callable + + + +# Old_Chunking_Lib.py +######################################### +# Old Chunking Library +# This library is used to handle chunking of text for summarization. +# +#### + + + +#################### +# Function List +# +# 1. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str] +# 2. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str +# 3. get_chat_completion(messages, model='gpt-4-turbo') +# 4. chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str] +# 5. combine_chunks_with_no_minimum(chunks: List[str], max_tokens: int, chunk_delimiter="\n\n", header: Optional[str] = None, add_ellipsis_for_overflow=False) -> Tuple[List[str], List[int]] +# 6. rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo', additional_instructions: Optional[str] = None, minimum_chunk_size: Optional[int] = 500, chunk_delimiter: str = ".", summarize_recursively=False, verbose=False) +# 7. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str] +# 8. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str +# +#################### + +# Import necessary libraries +import os +from typing import Optional + +# Import 3rd party +import openai +from openai import OpenAI + + + +import csv +import logging +import os +import re +import sqlite3 +import time +from contextlib import contextmanager +from datetime import datetime +from typing import List, Tuple + +import gradio as gr +import pandas as pd + +# Import Local + + + + + +# Summarization_General_Lib.py +######################################### +# General Summarization Library +# This library is used to perform summarization. +# +#### +import configparser +#################### +# Function List +# +# 1. extract_text_from_segments(segments: List[Dict]) -> str +# 2. summarize_with_openai(api_key, file_path, custom_prompt_arg) +# 3. summarize_with_claude(api_key, file_path, model, custom_prompt_arg, max_retries=3, retry_delay=5) +# 4. summarize_with_cohere(api_key, file_path, model, custom_prompt_arg) +# 5. summarize_with_groq(api_key, file_path, model, custom_prompt_arg) +# +# +#################### + + +# Import necessary libraries +import os +import logging +import time +import requests +from typing import List, Dict +import json +import configparser +from requests import RequestException + + + + +# System_Checks_Lib.py +######################################### +# System Checks Library +# This library is used to check the system for the necessary dependencies to run the script. +# It checks for the OS, the availability of the GPU, and the availability of the ffmpeg executable. +# If the GPU is available, it asks the user if they would like to use it for processing. +# If ffmpeg is not found, it asks the user if they would like to download it. +# The script will exit if the user chooses not to download ffmpeg. +#### + +#################### +# Function List +# +# 1. platform_check() +# 2. cuda_check() +# 3. decide_cpugpu() +# 4. check_ffmpeg() +# 5. 
download_ffmpeg()
+#
+####################
+
+
+
+
+# Import necessary libraries
+import os
+import platform
+import subprocess
+import shutil
+import zipfile
+import logging
+
+
+
+
+
+
+# Video_DL_Ingestion_Lib.py
+#########################################
+# Video Downloader and Ingestion Library
+# This library is used to handle downloading videos from YouTube and other platforms.
+# It also handles the ingestion of the videos into the database.
+# It uses yt-dlp to extract video information and download the videos.
+####
+
+####################
+# Function List
+#
+# 1. get_video_info(url)
+# 2. create_download_directory(title)
+# 3. sanitize_filename(title)
+# 4. normalize_title(title)
+# 5. get_youtube(video_url)
+# 6. get_playlist_videos(playlist_url)
+# 7. download_video(video_url, download_path, info_dict, download_video_flag)
+# 8. save_to_file(video_urls, filename)
+# 9. save_summary_to_file(summary, file_path)
+# 10. process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, download_video, download_audio, rolling_summarization, detail_level, question_box, keywords, chunk_summarization, chunk_duration_input, words_per_second_input)
+#
+#
+####################
+
+
+# Import necessary libraries to run solo for testing
+from datetime import datetime
+import json
+import logging
+import os
+import re
+import subprocess
+import sys
+import unicodedata
+# 3rd-Party Imports
+import yt_dlp
+
+server_mode = False
+share_public = False
+
+
+#######################################################################################################################
+# Function Definitions
+#
+
+def get_video_info(url: str) -> dict:
+    ydl_opts = {
+        'quiet': True,
+        'no_warnings': True,
+        'skip_download': True,
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        try:
+            info_dict = ydl.extract_info(url, download=False)
+            return info_dict
+        except Exception as e:
+            logging.error(f"Error extracting video info: {e}")
+            return None
+
+
+def create_download_directory(title):
+    base_dir = "Results"
+    # Remove characters that are illegal in Windows filenames and normalize
+    safe_title = normalize_title(title)
+    logging.debug(f"{title} successfully normalized")
+    session_path = os.path.join(base_dir, safe_title)
+    if not os.path.exists(session_path):
+        os.makedirs(session_path, exist_ok=True)
+        logging.debug(f"Created directory for downloaded video: {session_path}")
+    else:
+        logging.debug(f"Directory already exists for downloaded video: {session_path}")
+    return session_path
+
+
+def sanitize_filename(title, max_length=255):
+    # Remove invalid path characters
+    title = re.sub(r'[\\/*?:"<>|]', "", title)
+    # Truncate long titles to avoid filesystem errors
+    return title[:max_length].rstrip()
+
+
+def normalize_title(title):
+    # Normalize the string to 'NFKD' form and encode to 'ascii', ignoring non-ascii characters
+    title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
+    title = (title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '')
+             .replace('*', '').replace('?', '').replace('<', '').replace('>', '').replace('|', ''))
+    return title
+
+
+def get_youtube(video_url):
+    ydl_opts = {
+        'format': 'bestaudio[ext=m4a]',
+        'noplaylist': False,
+        'quiet': True,
+        'extract_flat': True
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        logging.debug("About to extract youtube info")
+        info_dict = ydl.extract_info(video_url, download=False)
+        logging.debug("Youtube info successfully extracted")
+    return info_dict
+
+
+def get_playlist_videos(playlist_url):
+    ydl_opts = {
+        'extract_flat': True,
+        'skip_download': True,
+        'quiet': True
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(playlist_url, download=False)
+
+    if 'entries' in info:
+        video_urls = [entry['url'] for entry in info['entries']]
+        playlist_title = info['title']
+        return video_urls, playlist_title
+    else:
+        print("No videos found in the playlist.")
+        return [], None
+
+
+def download_video(video_url, download_path, info_dict, download_video_flag):
+    global video_file_path, ffmpeg_path
+    global audio_file_path
+
+    # Normalize the video title
+    logging.debug("About to normalize downloaded video title")
+    normalized_video_title = normalize_title(info_dict['title'])
+    video_file_path = os.path.join(download_path, f"{normalized_video_title}.{info_dict['ext']}")
+
+    # Check for existence of the video file
+    if os.path.exists(video_file_path):
+        logging.info(f"Video file already exists: {video_file_path}")
+        return video_file_path
+
+    # Set up path handling for ffmpeg on different OSes
+    if sys.platform.startswith('win'):
+        ffmpeg_path = os.path.join(os.getcwd(), 'Bin', 'ffmpeg.exe')
+    elif sys.platform.startswith('linux'):
+        ffmpeg_path = 'ffmpeg'
+    elif sys.platform.startswith('darwin'):
+        ffmpeg_path = 'ffmpeg'
+
+    if download_video_flag:
+        video_file_path = os.path.join(download_path, f"{normalized_video_title}.mp4")
+
+        # Set options for video and audio
+        ydl_opts_video = {
+            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]',
+            'outtmpl': video_file_path,
+            'ffmpeg_location': ffmpeg_path
+        }
+
+        with yt_dlp.YoutubeDL(ydl_opts_video) as ydl:
+            logging.debug("yt_dlp: About to download video with yt-dlp")
+            ydl.download([video_url])
+            logging.debug("yt_dlp: Video successfully downloaded with yt-dlp")
+        return video_file_path
+
+    else:
+        return None
+
+
+def save_to_file(video_urls, filename):
+    with open(filename, 'w') as file:
+        file.write('\n'.join(video_urls))
+    print(f"Video URLs saved to {filename}")
+
+#
+#
+#######################################################################################################################
+
+
+
+#
+
+import tiktoken
+
+def openai_tokenize(text: str) -> List[int]:
+    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
+    return encoding.encode(text)
+
+def platform_check():
+    global userOS
+    if platform.system() == "Linux":
+        print("Linux OS detected \n Running Linux appropriate commands")
+        userOS = "Linux"
+    elif platform.system() == "Windows":
+        print("Windows OS detected \n Running Windows appropriate commands")
+        userOS = "Windows"
+    else:
+        print("Other OS detected \n Maybe try running things manually?")
+        exit()
+
+
+# Check for NVIDIA GPU and CUDA availability
+def cuda_check():
+    global processing_choice
+    try:
+        # Run nvidia-smi to capture its output
+        nvidia_smi_output = subprocess.check_output("nvidia-smi", shell=True).decode()
+
+        # Look for the CUDA version in the output
+        if "CUDA Version" in nvidia_smi_output:
+            cuda_version = next(
+                (line.split(":")[-1].strip() for line in nvidia_smi_output.splitlines() if "CUDA Version" in line),
+                "Not found")
+            print(f"NVIDIA GPU with CUDA Version {cuda_version} is available.")
+            processing_choice = "cuda"
+        else:
+            print("CUDA is not installed or configured correctly.")
+            processing_choice = "cpu"
+
+    except subprocess.CalledProcessError as e:
+        print(f"Failed to run 'nvidia-smi': {str(e)}")
+        processing_choice = "cpu"
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        processing_choice = "cpu"
+
+    # Optionally, check for the CUDA_VISIBLE_DEVICES env variable as an additional check
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        print("CUDA_VISIBLE_DEVICES is set:", os.environ["CUDA_VISIBLE_DEVICES"])
+    else:
+        print("CUDA_VISIBLE_DEVICES not set.")
+
+
+# Ask the user whether they would like to use their GPU or their CPU for transcription
+def decide_cpugpu():
+    global processing_choice
+    processing_input = input("Would you like to use your GPU or CPU for transcription? (1/cuda)GPU/(2/cpu)CPU): ")
+    if processing_choice == "cuda" and (processing_input.lower() == "cuda" or processing_input == "1"):
+        print("You've chosen to use the GPU.")
+        logging.debug("GPU is being used for processing")
+        processing_choice = "cuda"
+    elif processing_input.lower() == "cpu" or processing_input == "2":
+        print("You've chosen to use the CPU.")
+        logging.debug("CPU is being used for processing")
+        processing_choice = "cpu"
+    else:
+        print("Invalid choice. Please select either GPU or CPU.")
+
+
+# Check for the existence of ffmpeg
+def check_ffmpeg():
+    if shutil.which("ffmpeg") or (os.path.exists("Bin") and os.path.isfile(".\\Bin\\ffmpeg.exe")):
+        logging.debug("ffmpeg found installed on the local system, in the local PATH, or in the './Bin' folder")
+    else:
+        logging.debug("ffmpeg not installed on the local system/in local PATH")
+        print(
+            "ffmpeg is not installed.\n\n You can either install it manually, or through your package manager of "
+            "choice.\n Windows users, builds are here: https://www.gyan.dev/ffmpeg/builds/")
+        if userOS == "Windows":
+            download_ffmpeg()
+        elif userOS == "Linux":
+            print(
+                "You should install ffmpeg using your platform's appropriate package manager, 'apt install ffmpeg',"
+                "'dnf install ffmpeg' or 'pacman', etc.")
+        else:
+            logging.debug("running an unsupported OS")
+            print("You're running an unsupported/untested OS")
+            exit_script = input("Let's exit the script, unless you're feeling lucky? (y/n)")
+            if exit_script.lower() in ("y", "yes", "1"):
+                exit()
+
+
+# Download ffmpeg
+def download_ffmpeg():
+    user_choice = input("Do you want to download ffmpeg? (y)Yes/(n)No: ")
(y)Yes/(n)No: ") + if user_choice.lower() in ['yes', 'y', '1']: + print("Downloading ffmpeg") + url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip" + response = requests.get(url) + + if response.status_code == 200: + print("Saving ffmpeg zip file") + logging.debug("Saving ffmpeg zip file") + zip_path = "ffmpeg-release-essentials.zip" + with open(zip_path, 'wb') as file: + file.write(response.content) + + logging.debug("Extracting the 'ffmpeg.exe' file from the zip") + print("Extracting ffmpeg.exe from zip file to '/Bin' folder") + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + # Find the ffmpeg.exe file within the zip + ffmpeg_path = None + for file_info in zip_ref.infolist(): + if file_info.filename.endswith("ffmpeg.exe"): + ffmpeg_path = file_info.filename + break + + if ffmpeg_path is None: + logging.error("ffmpeg.exe not found in the zip file.") + print("ffmpeg.exe not found in the zip file.") + return + + logging.debug("checking if the './Bin' folder exists, creating if not") + bin_folder = "Bin" + if not os.path.exists(bin_folder): + logging.debug("Creating a folder for './Bin', it didn't previously exist") + os.makedirs(bin_folder) + + logging.debug("Extracting 'ffmpeg.exe' to the './Bin' folder") + zip_ref.extract(ffmpeg_path, path=bin_folder) + + logging.debug("Moving 'ffmpeg.exe' to the './Bin' folder") + src_path = os.path.join(bin_folder, ffmpeg_path) + dst_path = os.path.join(bin_folder, "ffmpeg.exe") + shutil.move(src_path, dst_path) + + logging.debug("Removing ffmpeg zip file") + print("Deleting zip file (we've already extracted ffmpeg.exe, no worries)") + os.remove(zip_path) + + logging.debug("ffmpeg.exe has been downloaded and extracted to the './Bin' folder.") + print("ffmpeg.exe has been successfully downloaded and extracted to the './Bin' folder.") + else: + logging.error("Failed to download the zip file.") + print("Failed to download the zip file.") + else: + logging.debug("User chose to not download ffmpeg") + print("ffmpeg will not be downloaded.") + +# +# +####################################################################################################################### + + + +# Read configuration from file +config = configparser.ConfigParser() +config.read('../config.txt') + +# API Keys +anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None) +logging.debug(f"Loaded Anthropic API Key: {anthropic_api_key}") + +cohere_api_key = config.get('API', 'cohere_api_key', fallback=None) +logging.debug(f"Loaded cohere API Key: {cohere_api_key}") + +groq_api_key = config.get('API', 'groq_api_key', fallback=None) +logging.debug(f"Loaded groq API Key: {groq_api_key}") + +openai_api_key = config.get('API', 'openai_api_key', fallback=None) +logging.debug(f"Loaded openAI Face API Key: {openai_api_key}") + +huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None) +logging.debug(f"Loaded HuggingFace Face API Key: {huggingface_api_key}") + +openrouter_api_token = config.get('API', 'openrouter_api_token', fallback=None) +logging.debug(f"Loaded OpenRouter API Key: {openrouter_api_token}") + +# Models +anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229') +cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus') +groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192') +openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo') +huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus') 
+openrouter_model = config.get('API', 'openrouter_model', fallback='mistralai/mistral-7b-instruct:free') + + +####################################################################################################################### +# Function Definitions +# + +# FIXME +# def extract_text_from_segments(segments: List[Dict]) -> str: +# """Extract text from segments.""" +# return " ".join([segment['text'] for segment in segments]) + + +def extract_text_from_segments(segments): + logging.debug(f"Segments received: {segments}") + logging.debug(f"Type of segments: {type(segments)}") + + text = "" + for segment in segments: + logging.debug(f"Current segment: {segment}") + logging.debug(f"Type of segment: {type(segment)}") + text += segment['Text'] + " " + return text.strip() + + +def summarize_with_openai(api_key, json_file_path, custom_prompt_arg): + try: + logging.debug("openai: Loading json data for summarization") + with open(json_file_path, 'r') as file: + data = json.load(file) + + logging.debug(f"openai: Loaded data: {data}") + logging.debug(f"openai: Type of data: {type(data)}") + + if isinstance(data, dict) and 'summary' in data: + # If the loaded data is a dictionary and already contains a summary, return it + logging.debug("openai: Summary already exists in the loaded data") + return data['summary'] + + # If the loaded data is a list of segment dictionaries, proceed with summarization + segments = data + + open_ai_model = openai_model or 'gpt-4-turbo' + + logging.debug("openai: Extracting text from the segments") + text = extract_text_from_segments(segments) + + headers = { + 'Authorization': f'Bearer {api_key}', + 'Content-Type': 'application/json' + } + + logging.debug(f"openai: API Key is: {api_key}") + logging.debug("openai: Preparing data + prompt for submittal") + openai_prompt = f"{text} \n\n\n\n{custom_prompt_arg}" + data = { + "model": open_ai_model, + "messages": [ + { + "role": "system", + "content": "You are a professional summarizer." 
+ }, + { + "role": "user", + "content": openai_prompt + } + ], + "max_tokens": 8192, # Adjust tokens as needed + "temperature": 0.1 + } + logging.debug("openai: Posting request") + response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=data) + + if response.status_code == 200: + response_data = response.json() + if 'choices' in response_data and len(response_data['choices']) > 0: + summary = response_data['choices'][0]['message']['content'].strip() + logging.debug("openai: Summarization successful") + print("openai: Summarization successful.") + return summary + else: + logging.warning("openai: Summary not found in the response data") + return "openai: Summary not available" + else: + logging.debug("openai: Summarization failed") + print("openai: Failed to process summary:", response.text) + return "openai: Failed to process summary" + except Exception as e: + logging.debug("openai: Error in processing: %s", str(e)) + print("openai: Error occurred while processing summary with openai:", str(e)) + return "openai: Error occurred while processing summary" + + +def summarize_with_claude(api_key, file_path, model, custom_prompt_arg, max_retries=3, retry_delay=5): + try: + logging.debug("anthropic: Loading JSON data") + with open(file_path, 'r') as file: + segments = json.load(file) + + logging.debug("anthropic: Extracting text from the segments file") + text = extract_text_from_segments(segments) + + headers = { + 'x-api-key': api_key, + 'anthropic-version': '2023-06-01', + 'Content-Type': 'application/json' + } + + anthropic_prompt = custom_prompt_arg # Sanitize the custom prompt + logging.debug(f"anthropic: Prompt is {anthropic_prompt}") + user_message = { + "role": "user", + "content": f"{text} \n\n\n\n{anthropic_prompt}" + } + + data = { + "model": model, + "max_tokens": 4096, # max _possible_ tokens to return + "messages": [user_message], + "stop_sequences": ["\n\nHuman:"], + "temperature": 0.1, + "top_k": 0, + "top_p": 1.0, + "metadata": { + "user_id": "example_user_id", + }, + "stream": False, + "system": "You are a professional summarizer." + } + + for attempt in range(max_retries): + try: + logging.debug("anthropic: Posting request to API") + response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=data) + + # Check if the status code indicates success + if response.status_code == 200: + logging.debug("anthropic: Post submittal successful") + response_data = response.json() + try: + summary = response_data['content'][0]['text'].strip() + logging.debug("anthropic: Summarization successful") + print("Summary processed successfully.") + return summary + except (IndexError, KeyError) as e: + logging.debug("anthropic: Unexpected data in response") + print("Unexpected response format from Claude API:", response.text) + return None + elif response.status_code == 500: # Handle internal server error specifically + logging.debug("anthropic: Internal server error") + print("Internal server error from API. 
Retrying may be necessary.") + time.sleep(retry_delay) + else: + logging.debug( + f"anthropic: Failed to summarize, status code {response.status_code}: {response.text}") + print(f"Failed to process summary, status code {response.status_code}: {response.text}") + return None + + except RequestException as e: + logging.error(f"anthropic: Network error during attempt {attempt + 1}/{max_retries}: {str(e)}") + if attempt < max_retries - 1: + time.sleep(retry_delay) + else: + return f"anthropic: Network error: {str(e)}" + + except FileNotFoundError as e: + logging.error(f"anthropic: File not found: {file_path}") + return f"anthropic: File not found: {file_path}" + except json.JSONDecodeError as e: + logging.error(f"anthropic: Invalid JSON format in file: {file_path}") + return f"anthropic: Invalid JSON format in file: {file_path}" + except Exception as e: + logging.error(f"anthropic: Error in processing: {str(e)}") + return f"anthropic: Error occurred while processing summary with Anthropic: {str(e)}" + + +# Summarize with Cohere +def summarize_with_cohere(api_key, file_path, model, custom_prompt_arg): + try: + logging.debug("cohere: Loading JSON data") + with open(file_path, 'r') as file: + segments = json.load(file) + + logging.debug(f"cohere: Extracting text from segments file") + text = extract_text_from_segments(segments) + + headers = { + 'accept': 'application/json', + 'content-type': 'application/json', + 'Authorization': f'Bearer {api_key}' + } + + cohere_prompt = f"{text} \n\n\n\n{custom_prompt_arg}" + logging.debug("cohere: Prompt being sent is {cohere_prompt}") + + data = { + "chat_history": [ + {"role": "USER", "message": cohere_prompt} + ], + "message": "Please provide a summary.", + "model": model, + "connectors": [{"id": "web-search"}] + } + + logging.debug("cohere: Submitting request to API endpoint") + print("cohere: Submitting request to API endpoint") + response = requests.post('https://api.cohere.ai/v1/chat', headers=headers, json=data) + response_data = response.json() + logging.debug("API Response Data: %s", response_data) + + if response.status_code == 200: + if 'text' in response_data: + summary = response_data['text'].strip() + logging.debug("cohere: Summarization successful") + print("Summary processed successfully.") + return summary + else: + logging.error("Expected data not found in API response.") + return "Expected data not found in API response." 
+        else:
+            logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")
+            print(f"Failed to process summary, status code {response.status_code}: {response.text}")
+            return f"cohere: API request failed: {response.text}"
+
+    except Exception as e:
+        logging.error("cohere: Error in processing: %s", str(e))
+        return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"
+
+
+# https://console.groq.com/docs/quickstart
+def summarize_with_groq(api_key, file_path, model, custom_prompt_arg):
+    try:
+        logging.debug("groq: Loading JSON data")
+        with open(file_path, 'r') as file:
+            segments = json.load(file)
+
+        logging.debug("groq: Extracting text from segments file")
+        text = extract_text_from_segments(segments)
+
+        headers = {
+            'Authorization': f'Bearer {api_key}',
+            'Content-Type': 'application/json'
+        }
+
+        groq_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
+        logging.debug(f"groq: Prompt being sent is {groq_prompt}")
+
+        data = {
+            "messages": [
+                {
+                    "role": "user",
+                    "content": groq_prompt
+                }
+            ],
+            "model": model
+        }
+
+        logging.debug("groq: Submitting request to API endpoint")
+        print("groq: Submitting request to API endpoint")
+        response = requests.post('https://api.groq.com/openai/v1/chat/completions', headers=headers, json=data)
+
+        response_data = response.json()
+        logging.debug("API Response Data: %s", response_data)
+
+        if response.status_code == 200:
+            if 'choices' in response_data and len(response_data['choices']) > 0:
+                summary = response_data['choices'][0]['message']['content'].strip()
+                logging.debug("groq: Summarization successful")
+                print("Summarization successful.")
+                return summary
+            else:
+                logging.error("Expected data not found in API response.")
+                return "Expected data not found in API response."
+        else:
+            logging.error(f"groq: API request failed with status code {response.status_code}: {response.text}")
+            return f"groq: API request failed: {response.text}"
+
+    except Exception as e:
+        logging.error("groq: Error in processing: %s", str(e))
+        return f"groq: Error occurred while processing summary with groq: {str(e)}"
+
+
+def summarize_with_openrouter(api_key, json_file_path, custom_prompt_arg):
+    global openrouter_model
+
+    config = configparser.ConfigParser()
+
+    # Check for config.txt in the current directory, then in the parent directory
+    if os.path.exists('config.txt'):
+        config.read('config.txt')
+    elif os.path.exists('../config.txt'):
+        config.read('../config.txt')
+    else:
+        print("config.txt not found in the current directory or its parent.")
+
+    openrouter_api_token = config.get('API', 'openrouter_api_token', fallback=None)
+    if openrouter_model is None:
+        openrouter_model = "mistralai/mistral-7b-instruct:free"
+
+    # Load the segments file and extract its text, matching the other backends
+    logging.debug("openrouter: Loading JSON data")
+    with open(json_file_path, 'r') as file:
+        segments = json.load(file)
+    text = extract_text_from_segments(segments)
+
+    openrouter_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
+
+    try:
+        logging.debug("openrouter: Submitting request to API endpoint")
+        print("openrouter: Submitting request to API endpoint")
+        response = requests.post(
+            url="https://openrouter.ai/api/v1/chat/completions",
+            headers={
+                "Authorization": f"Bearer {openrouter_api_token}",
+            },
+            data=json.dumps({
+                "model": f"{openrouter_model}",
+                "messages": [
+                    {"role": "user", "content": openrouter_prompt}
+                ]
+            })
+        )
+
+        response_data = response.json()
+        logging.debug("API Response Data: %s", response_data)
+
+        if response.status_code == 200:
+            if 'choices' in response_data and len(response_data['choices']) > 0:
+                summary = response_data['choices'][0]['message']['content'].strip()
+                logging.debug("openrouter: Summarization successful")
+                print("openrouter: Summarization successful.")
+                return summary
+            else:
+                logging.error("openrouter: Expected data not found in API response.")
+                return "openrouter: Expected data not found in API response."
+        else:
+            logging.error(f"openrouter: API request failed with status code {response.status_code}: {response.text}")
+            return f"openrouter: API request failed: {response.text}"
+    except Exception as e:
+        logging.error("openrouter: Error in processing: %s", str(e))
+        return f"openrouter: Error occurred while processing summary with openrouter: {str(e)}"
+
+def summarize_with_huggingface(api_key, file_path, custom_prompt_arg):
+    logging.debug("huggingface: Summarization process starting...")
+    try:
+        logging.debug("huggingface: Loading json data for summarization")
+        with open(file_path, 'r') as file:
+            segments = json.load(file)
+
+        logging.debug("huggingface: Extracting text from the segments")
+        logging.debug(f"huggingface: Segments: {segments}")
+        text = ' '.join([segment['text'] for segment in segments])
+
+        print(f"huggingface: let's make sure the HF api key exists...\n\t {api_key}")
+        headers = {
+            "Authorization": f"Bearer {api_key}"
+        }
+
+        model = "microsoft/Phi-3-mini-128k-instruct"
+        API_URL = f"https://api-inference.huggingface.co/models/{model}"
+
+        huggingface_prompt = f"{text}\n\n\n\n{custom_prompt_arg}"
+        logging.debug(f"huggingface: Prompt being sent is {huggingface_prompt}")
+        data = {
+            "inputs": text,
+            "parameters": {"max_length": 512, "min_length": 100}  # You can adjust max_length and min_length as needed
+        }
+
+        print(f"huggingface: let's make sure the HF api key is the same..\n\t {huggingface_api_key}")
+
+        logging.debug("huggingface: Submitting request...")
+
+        response = requests.post(API_URL, headers=headers, json=data)
+
+        if response.status_code == 200:
+            summary = response.json()[0]['summary_text']
+            logging.debug("huggingface: Summarization successful")
+            print("Summarization successful.")
+            return summary
+        else:
+            logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
+            return f"Failed to process summary, status code {response.status_code}: {response.text}"
+    except Exception as e:
+        logging.error("huggingface: Error in processing: %s", str(e))
+        print(f"Error occurred while processing summary with huggingface: {str(e)}")
+        return None
+
+    # FIXME
+    # This is here for gradio authentication
+    # It's just not set up yet.
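+    # A minimal wiring sketch for when it is implemented (assumes `demo` is the
+    # gr.Blocks/gr.Interface object this app launches; gradio's launch() accepts
+    # a (username, password) -> bool callable via its auth parameter):
+    #   demo.launch(auth=same_auth)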
+    # def same_auth(username, password):
+    #     return username == password
+
+
+#
+#
+#######################################################################################################################
+
+
+
+
+
+
+
+
+# Set up logging
+#logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+#logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+# Custom exceptions
+class DatabaseError(Exception):
+    pass
+
+
+class InputError(Exception):
+    pass
+
+
+# Database connection class with connection pooling
+class Database:
+    def __init__(self, db_name=None):
+        self.db_name = db_name or os.getenv('DB_NAME', 'media_summary.db')
+        self.pool = []
+        self.pool_size = 10
+
+    @contextmanager
+    def get_connection(self):
+        retry_count = 5
+        retry_delay = 1
+        conn = None
+        while retry_count > 0:
+            try:
+                conn = self.pool.pop() if self.pool else sqlite3.connect(self.db_name, check_same_thread=False)
+                yield conn
+                return
+            except sqlite3.OperationalError as e:
+                if 'database is locked' in str(e):
+                    logging.warning(f"Database is locked, retrying in {retry_delay} seconds...")
+                    retry_count -= 1
+                    time.sleep(retry_delay)
+                else:
+                    raise DatabaseError(f"Database error: {e}")
+            except Exception as e:
+                raise DatabaseError(f"Unexpected error: {e}")
+            finally:
+                # Return the connection to the pool exactly once, even on failure
+                if conn:
+                    self.pool.append(conn)
+                    conn = None
+        raise DatabaseError("Database is locked and retries have been exhausted")
+
+    def execute_query(self, query: str, params: Tuple = ()) -> None:
+        with self.get_connection() as conn:
+            try:
+                cursor = conn.cursor()
+                cursor.execute(query, params)
+                conn.commit()
+            except sqlite3.Error as e:
+                raise DatabaseError(f"Database error: {e}, Query: {query}")
+
+db = Database()
+
+
+# Function to create tables with the new media schema
+def create_tables() -> None:
+    table_queries = [
+        '''
+        CREATE TABLE IF NOT EXISTS Media (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            url TEXT,
+            title TEXT NOT NULL,
+            type TEXT NOT NULL,
+            content TEXT,
+            author TEXT,
+            ingestion_date TEXT,
+            prompt TEXT,
+            summary TEXT,
+            transcription_model TEXT
+        )
+        ''',
+        '''
+        CREATE TABLE IF NOT EXISTS Keywords (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            keyword TEXT NOT NULL UNIQUE
+        )
+        ''',
+        '''
+        CREATE TABLE IF NOT EXISTS MediaKeywords (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            media_id INTEGER NOT NULL,
+            keyword_id INTEGER NOT NULL,
+            FOREIGN KEY (media_id) REFERENCES Media(id),
+            FOREIGN KEY (keyword_id) REFERENCES Keywords(id)
+        )
+        ''',
+        '''
+        CREATE TABLE IF NOT EXISTS MediaVersion (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            media_id INTEGER NOT NULL,
+            version INTEGER NOT NULL,
+            prompt TEXT,
+            summary TEXT,
+            created_at TEXT NOT NULL,
+            FOREIGN KEY (media_id) REFERENCES Media(id)
+        )
+        ''',
+        '''
+        CREATE TABLE IF NOT EXISTS MediaModifications (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            media_id INTEGER NOT NULL,
+            prompt TEXT,
+            summary TEXT,
+            modification_date TEXT,
+            FOREIGN KEY (media_id) REFERENCES Media(id)
+        )
+        ''',
+        '''
+        CREATE VIRTUAL TABLE IF NOT EXISTS media_fts USING fts5(title, content);
+        ''',
+        '''
+        CREATE VIRTUAL TABLE IF NOT EXISTS keyword_fts USING fts5(keyword);
+        ''',
+        '''
+        CREATE INDEX IF NOT EXISTS idx_media_title ON Media(title);
+        ''',
+        '''
+        CREATE INDEX IF NOT EXISTS idx_media_type ON Media(type);
+        ''',
+        '''
+        CREATE INDEX IF NOT EXISTS idx_media_author ON Media(author);
+        ''',
+        '''
+        CREATE
INDEX IF NOT EXISTS idx_media_ingestion_date ON Media(ingestion_date); + ''', + ''' + CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON Keywords(keyword); + ''', + ''' + CREATE INDEX IF NOT EXISTS idx_mediakeywords_media_id ON MediaKeywords(media_id); + ''', + ''' + CREATE INDEX IF NOT EXISTS idx_mediakeywords_keyword_id ON MediaKeywords(keyword_id); + ''', + ''' + CREATE INDEX IF NOT EXISTS idx_media_version_media_id ON MediaVersion(media_id); + ''' + ] + for query in table_queries: + db.execute_query(query) + +create_tables() + + +####################################################################################################################### +# Keyword-related Functions +# + +# Function to add a keyword +def add_keyword(keyword: str) -> int: + keyword = keyword.strip().lower() + with db.get_connection() as conn: + cursor = conn.cursor() + try: + cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,)) + cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,)) + keyword_id = cursor.fetchone()[0] + cursor.execute('INSERT OR IGNORE INTO keyword_fts (rowid, keyword) VALUES (?, ?)', (keyword_id, keyword)) + logging.info(f"Keyword '{keyword}' added to keyword_fts with ID: {keyword_id}") + conn.commit() + return keyword_id + except sqlite3.IntegrityError as e: + logging.error(f"Integrity error adding keyword: {e}") + raise DatabaseError(f"Integrity error adding keyword: {e}") + except sqlite3.Error as e: + logging.error(f"Error adding keyword: {e}") + raise DatabaseError(f"Error adding keyword: {e}") + + +# Function to delete a keyword +def delete_keyword(keyword: str) -> str: + keyword = keyword.strip().lower() + with db.get_connection() as conn: + cursor = conn.cursor() + try: + cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,)) + keyword_id = cursor.fetchone() + if keyword_id: + cursor.execute('DELETE FROM Keywords WHERE keyword = ?', (keyword,)) + cursor.execute('DELETE FROM keyword_fts WHERE rowid = ?', (keyword_id[0],)) + conn.commit() + return f"Keyword '{keyword}' deleted successfully." + else: + return f"Keyword '{keyword}' not found." + except sqlite3.Error as e: + raise DatabaseError(f"Error deleting keyword: {e}") + + + +# Function to add media with keywords +def add_media_with_keywords(url, title, media_type, content, keywords, prompt, summary, transcription_model, author, ingestion_date): + # Set default values for missing fields + url = url or 'Unknown' + title = title or 'Untitled' + media_type = media_type or 'Unknown' + content = content or 'No content available' + keywords = keywords or 'default' + prompt = prompt or 'No prompt available' + summary = summary or 'No summary available' + transcription_model = transcription_model or 'Unknown' + author = author or 'Unknown' + ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d') + + # Ensure URL is valid + if not is_valid_url(url): + url = 'localhost' + + if media_type not in ['document', 'video', 'article']: + raise InputError("Invalid media type. Allowed types: document, video, article.") + + if ingestion_date and not is_valid_date(ingestion_date): + raise InputError("Invalid ingestion date format. 
Use YYYY-MM-DD.")
+
+    # Split keywords by comma and normalize
+    keyword_list = [keyword.strip().lower() for keyword in keywords.split(',')]
+
+    logging.info(f"URL: {url}")
+    logging.info(f"Title: {title}")
+    logging.info(f"Media Type: {media_type}")
+    logging.info(f"Keywords: {keywords}")
+    logging.info(f"Content (first 50 chars): {content[:50]}...")
+    logging.info(f"Prompt: {prompt}")
+    logging.info(f"Summary: {summary}")
+    logging.info(f"Author: {author}")
+    logging.info(f"Ingestion Date: {ingestion_date}")
+    logging.info(f"Transcription Model: {transcription_model}")
+
+    try:
+        with db.get_connection() as conn:
+            cursor = conn.cursor()
+
+            # Check if media already exists
+            cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
+            existing_media = cursor.fetchone()
+
+            if existing_media:
+                media_id = existing_media[0]
+                logger.info(f"Existing media found with ID: {media_id}")
+
+                # Insert new prompt and summary into MediaModifications
+                cursor.execute('''
+                INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
+                VALUES (?, ?, ?, ?)
+                ''', (media_id, prompt, summary, ingestion_date))
+                logger.info("New summary and prompt added to MediaModifications")
+            else:
+                logger.info("New media entry being created")
+
+                # Insert new media item
+                cursor.execute('''
+                INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                ''', (url, title, media_type, content, author, ingestion_date, transcription_model))
+                media_id = cursor.lastrowid
+
+                # Insert keywords and associate with media item
+                for keyword in keyword_list:
+                    keyword = keyword.strip().lower()
+                    cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
+                    cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
+                    keyword_id = cursor.fetchone()[0]
+                    cursor.execute('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)', (media_id, keyword_id))
+
+                # Index the new media item for full-text search (once, outside the keyword loop)
+                cursor.execute('INSERT INTO media_fts (rowid, title, content) VALUES (?, ?, ?)', (media_id, title, content))
+
+                # Also insert the initial prompt and summary into MediaModifications
+                cursor.execute('''
+                INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
+                VALUES (?, ?, ?, ?)
+ ''', (media_id, prompt, summary, ingestion_date)) + + conn.commit() + + # Insert initial version of the prompt and summary + add_media_version(media_id, prompt, summary) + + return f"Media '{title}' added successfully with keywords: {', '.join(keyword_list)}" + except sqlite3.IntegrityError as e: + logger.error(f"Integrity Error: {e}") + raise DatabaseError(f"Integrity error adding media with keywords: {e}") + except sqlite3.Error as e: + logger.error(f"SQL Error: {e}") + raise DatabaseError(f"Error adding media with keywords: {e}") + except Exception as e: + logger.error(f"Unexpected Error: {e}") + raise DatabaseError(f"Unexpected error: {e}") + + +def fetch_all_keywords() -> List[str]: + try: + with db.get_connection() as conn: + cursor = conn.cursor() + cursor.execute('SELECT keyword FROM Keywords') + keywords = [row[0] for row in cursor.fetchall()] + return keywords + except sqlite3.Error as e: + raise DatabaseError(f"Error fetching keywords: {e}") + +def keywords_browser_interface(): + keywords = fetch_all_keywords() + return gr.Markdown("\n".join(f"- {keyword}" for keyword in keywords)) + +def display_keywords(): + try: + keywords = fetch_all_keywords() + return "\n".join(keywords) if keywords else "No keywords found." + except DatabaseError as e: + return str(e) + + +def export_keywords_to_csv(): + try: + keywords = fetch_all_keywords() + if not keywords: + return None, "No keywords found in the database." + + filename = "keywords.csv" + with open(filename, 'w', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerow(["Keyword"]) + for keyword in keywords: + writer.writerow([keyword]) + + return filename, f"Keywords exported to {filename}" + except Exception as e: + logger.error(f"Error exporting keywords to CSV: {e}") + return None, f"Error exporting keywords: {e}" + + +# +# +####################################################################################################################### + + + + +# Function to add a version of a prompt and summary +def add_media_version(media_id: int, prompt: str, summary: str) -> None: + try: + with db.get_connection() as conn: + cursor = conn.cursor() + + # Get the current version number + cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,)) + current_version = cursor.fetchone()[0] or 0 + + # Insert the new version + cursor.execute(''' + INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at) + VALUES (?, ?, ?, ?, ?) 
+ ''', (media_id, current_version + 1, prompt, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))) + conn.commit() + except sqlite3.Error as e: + raise DatabaseError(f"Error adding media version: {e}") + + +# Function to search the database with advanced options, including keyword search and full-text search +def search_db(search_query: str, search_fields: List[str], keywords: str, page: int = 1, results_per_page: int = 10): + if page < 1: + raise ValueError("Page number must be 1 or greater.") + + # Prepare keywords by splitting and trimming + keywords = [keyword.strip().lower() for keyword in keywords.split(',') if keyword.strip()] + + with db.get_connection() as conn: + cursor = conn.cursor() + offset = (page - 1) * results_per_page + + # Prepare the search conditions for general fields + search_conditions = [] + params = [] + + for field in search_fields: + if search_query: # Ensure there's a search query before adding this condition + search_conditions.append(f"Media.{field} LIKE ?") + params.append(f'%{search_query}%') + + # Prepare the conditions for keywords filtering + keyword_conditions = [] + for keyword in keywords: + keyword_conditions.append( + f"EXISTS (SELECT 1 FROM MediaKeywords mk JOIN Keywords k ON mk.keyword_id = k.id WHERE mk.media_id = Media.id AND k.keyword LIKE ?)") + params.append(f'%{keyword}%') + + # Combine all conditions + where_clause = " AND ".join( + search_conditions + keyword_conditions) if search_conditions or keyword_conditions else "1=1" + + # Complete the query + query = f''' + SELECT DISTINCT Media.url, Media.title, Media.type, Media.content, Media.author, Media.ingestion_date, Media.prompt, Media.summary + FROM Media + WHERE {where_clause} + LIMIT ? OFFSET ? + ''' + params.extend([results_per_page, offset]) + + cursor.execute(query, params) + results = cursor.fetchall() + + return results + + +# Gradio function to handle user input and display results with pagination, with better feedback +def search_and_display(search_query, search_fields, keywords, page): + results = search_db(search_query, search_fields, keywords, page) + + if isinstance(results, pd.DataFrame): + # Convert DataFrame to a list of tuples or lists + processed_results = results.values.tolist() # This converts DataFrame rows to lists + elif isinstance(results, list): + # Ensure that each element in the list is itself a list or tuple (not a dictionary) + processed_results = [list(item.values()) if isinstance(item, dict) else item for item in results] + else: + raise TypeError("Unsupported data type for results") + + return processed_results + + +def display_details(index, results): + if index is None or results is None: + return "Please select a result to view details." + + try: + # Ensure the index is an integer and access the row properly + index = int(index) + if isinstance(results, pd.DataFrame): + if index >= len(results): + return "Index out of range. Please select a valid index." + selected_row = results.iloc[index] + else: + # If results is not a DataFrame, but a list (assuming list of dicts) + selected_row = results[index] + except ValueError: + return "Index must be an integer." + except IndexError: + return "Index out of range. Please select a valid index." + + # Build HTML output safely + details_html = f""" +
+        <h3>{selected_row.get('Title', 'No Title')}</h3>
+        <p><strong>URL:</strong> {selected_row.get('URL', 'No URL')}</p>
+        <p><strong>Type:</strong> {selected_row.get('Type', 'No Type')}</p>
+        <p><strong>Author:</strong> {selected_row.get('Author', 'No Author')}</p>
+        <p><strong>Ingestion Date:</strong> {selected_row.get('Ingestion Date', 'No Date')}</p>
+        <p><strong>Prompt:</strong> {selected_row.get('Prompt', 'No Prompt')}</p>
+        <p><strong>Summary:</strong> {selected_row.get('Summary', 'No Summary')}</p>
+        <p><strong>Content:</strong> {selected_row.get('Content', 'No Content')}</p>
+    """
+    return details_html
+
+
+def get_details(index, dataframe):
+    if index is None or dataframe is None or index >= len(dataframe):
+        return "Please select a result to view details."
+    row = dataframe.iloc[index]
+    details = f"""
+        <h3>{row['Title']}</h3>
+        <p><strong>URL:</strong> {row['URL']}</p>
+        <p><strong>Type:</strong> {row['Type']}</p>
+        <p><strong>Author:</strong> {row['Author']}</p>
+        <p><strong>Ingestion Date:</strong> {row['Ingestion Date']}</p>
+        <p><strong>Prompt:</strong> {row['Prompt']}</p>
+        <p><strong>Summary:</strong> {row['Summary']}</p>
+        <p><strong>Content:</strong></p>
+        <pre>{row['Content']}</pre>
+ """ + return details + + +def format_results(results): + if not results: + return pd.DataFrame(columns=['URL', 'Title', 'Type', 'Content', 'Author', 'Ingestion Date', 'Prompt', 'Summary']) + + df = pd.DataFrame(results, columns=['URL', 'Title', 'Type', 'Content', 'Author', 'Ingestion Date', 'Prompt', 'Summary']) + logging.debug(f"Formatted DataFrame: {df}") + + return df + +# Function to export search results to CSV with pagination +def export_to_csv(search_query: str, search_fields: List[str], keyword: str, page: int = 1, results_per_file: int = 1000): + try: + results = search_db(search_query, search_fields, keyword, page, results_per_file) + df = format_results(results) + filename = f'search_results_page_{page}.csv' + df.to_csv(filename, index=False) + return f"Results exported to {filename}" + except (DatabaseError, InputError) as e: + return str(e) + + +# Helper function to validate URL format +def is_valid_url(url: str) -> bool: + regex = re.compile( + r'^(?:http|ftp)s?://' # http:// or https:// + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain... + r'localhost|' # localhost... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # ...or ipv4 + r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6 + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', re.IGNORECASE) + return re.match(regex, url) is not None + + +# Helper function to validate date format +def is_valid_date(date_string: str) -> bool: + try: + datetime.strptime(date_string, '%Y-%m-%d') + return True + except ValueError: + return False + +# +# +####################################################################################################################### + + + + +####################################################################################################################### +# Functions to manage prompts DB +# + +def create_prompts_db(): + conn = sqlite3.connect('prompts.db') + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS Prompts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL UNIQUE, + details TEXT, + system TEXT, + user TEXT + ) + ''') + conn.commit() + conn.close() + +create_prompts_db() + + +def add_prompt(name, details, system, user=None): + try: + conn = sqlite3.connect('prompts.db') + cursor = conn.cursor() + cursor.execute(''' + INSERT INTO Prompts (name, details, system, user) + VALUES (?, ?, ?, ?) + ''', (name, details, system, user)) + conn.commit() + conn.close() + return "Prompt added successfully." + except sqlite3.IntegrityError: + return "Prompt with this name already exists." + except sqlite3.Error as e: + return f"Database error: {e}" + +def fetch_prompt_details(name): + conn = sqlite3.connect('prompts.db') + cursor = conn.cursor() + cursor.execute(''' + SELECT details, system, user + FROM Prompts + WHERE name = ? 
+ ''', (name,)) + result = cursor.fetchone() + conn.close() + return result + +def list_prompts(): + conn = sqlite3.connect('prompts.db') + cursor = conn.cursor() + cursor.execute(''' + SELECT name + FROM Prompts + ''') + results = cursor.fetchall() + conn.close() + return [row[0] for row in results] + +def insert_prompt_to_db(title, description, system_prompt, user_prompt): + result = add_prompt(title, description, system_prompt, user_prompt) + return result + +# +# +####################################################################################################################### + + + + + + +####################################################################################################################### +# Function Definitions +# + +######### Words-per-second Chunking ######### +def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]: + words = transcript.split() + words_per_chunk = chunk_duration * words_per_second + chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)] + return chunks + + +def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, + words_per_second: int) -> str: + if api_name not in summarizers: # See 'summarizers' dict in the main script + return f"Unsupported API: {api_name}" + + summarizer = summarizers[api_name] + text = extract_text_from_segments(transcript) + chunks = chunk_transcript(text, chunk_duration, words_per_second) + + summaries = [] + for chunk in chunks: + if api_name == 'openai': + # Ensure the correct model and prompt are passed + summaries.append(summarizer(api_key, chunk, custom_prompt)) + else: + summaries.append(summarizer(api_key, chunk)) + + return "\n\n".join(summaries) + + +################## #################### + + +######### Token-size Chunking ######### FIXME - OpenAI only currently +# This is dirty and shameful and terrible. It should be replaced with a proper implementation. +# anyways lets get to it.... +openai_api_key = "Fake_key" # FIXME +client = OpenAI(api_key=openai_api_key) + + +def get_chat_completion(messages, model='gpt-4-turbo'): + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=0, + ) + return response.choices[0].message.content + + +# This function chunks a text into smaller pieces based on a maximum token count and a delimiter +def chunk_on_delimiter(input_string: str, + max_tokens: int, + delimiter: str) -> List[str]: + chunks = input_string.split(delimiter) + combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum( + chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True) + if dropped_chunk_count > 0: + print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.") + combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks] + return combined_chunks + + +# This function combines text chunks into larger blocks without exceeding a specified token count. +# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow. 
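+# Illustrative example (an assumption for clarity — the real code tokenizes with
+# openai_tokenize, but pretend whitespace tokenization here): packing
+# ["a b", "c d e", "f"] with max_tokens=4 and delimiter "\n\n" yields
+# (["a b", "c d e\n\nf"], [[0], [1, 2]], 0): adding "c d e" would push "a b" past
+# the budget, so "a b" is flushed and the remaining chunks are greedily packed.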
+def combine_chunks_with_no_minimum(
+        chunks: List[str],
+        max_tokens: int,
+        chunk_delimiter="\n\n",
+        header: Optional[str] = None,
+        add_ellipsis_for_overflow=False,
+) -> Tuple[List[str], List[List[int]], int]:
+    dropped_chunk_count = 0
+    output = []  # list to hold the final combined chunks
+    output_indices = []  # list to hold the indices of the final combined chunks
+    candidate = (
+        [] if header is None else [header]
+    )  # list to hold the current combined chunk candidate
+    candidate_indices = []
+    for chunk_i, chunk in enumerate(chunks):
+        chunk_with_header = [chunk] if header is None else [header, chunk]
+        # FIXME MAKE NOT OPENAI SPECIFIC
+        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
+            print("warning: chunk overflow")
+            if (
+                    add_ellipsis_for_overflow
+                    # FIXME MAKE NOT OPENAI SPECIFIC
+                    and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
+            ):
+                candidate.append("...")
+            dropped_chunk_count += 1
+            continue  # this case would break downstream assumptions
+        # estimate token count with the current chunk added
+        # FIXME MAKE NOT OPENAI SPECIFIC
+        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
+        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
+        if extended_candidate_token_count > max_tokens:
+            output.append(chunk_delimiter.join(candidate))
+            output_indices.append(candidate_indices)
+            candidate = chunk_with_header  # re-initialize candidate
+            candidate_indices = [chunk_i]
+        # otherwise keep extending the candidate
+        else:
+            candidate.append(chunk)
+            candidate_indices.append(chunk_i)
+    # add the remaining candidate to output if it's not empty
+    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
+        output.append(chunk_delimiter.join(candidate))
+        output_indices.append(candidate_indices)
+    return output, output_indices, dropped_chunk_count
+
+
+def rolling_summarize(text: str,
+                      detail: float = 0,
+                      model: str = 'gpt-4-turbo',
+                      additional_instructions: Optional[str] = None,
+                      minimum_chunk_size: Optional[int] = 500,
+                      chunk_delimiter: str = ".",
+                      summarize_recursively=False,
+                      verbose=False):
+    """
+    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
+    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
+
+    Parameters:
+    - text (str): The text to be summarized.
+    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
+      0 leads to a higher-level summary, and 1 results in a more detailed summary. Defaults to 0.
+    - model (str, optional): The model to use for generating summaries. Defaults to 'gpt-4-turbo'.
+    - additional_instructions (Optional[str], optional): Additional instructions to provide to the model for
+      customizing summaries.
+    - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
+    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
+    - summarize_recursively (bool, optional): If True, summaries are generated recursively, using previous
+      summaries for context.
+    - verbose (bool, optional): If True, prints detailed information about the chunking process.
+
+    Returns:
+    - str: The final compiled summary of the text.
+
+    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
+    based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
+    `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
+    summarization process. The function returns a compiled summary of all chunks.
+    """
+
+    # check detail is set correctly
+    assert 0 <= detail <= 1
+
+    # interpolate the number of chunks to get the specified level of detail
+    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
+    min_chunks = 1
+    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
+
+    # adjust chunk_size based on interpolated number of chunks
+    # FIXME MAKE NOT OPENAI SPECIFIC
+    document_length = len(openai_tokenize(text))
+    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
+    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
+    if verbose:
+        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
+        # FIXME MAKE NOT OPENAI SPECIFIC
+        print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")
+
+    # set system message
+    system_message_content = "Rewrite this text in summarized form."
+    if additional_instructions is not None:
+        system_message_content += f"\n\n{additional_instructions}"
+
+    accumulated_summaries = []
+    for chunk in tqdm(text_chunks):
+        if summarize_recursively and accumulated_summaries:
+            # Creating a structured prompt for recursive summarization
+            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
+            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
+        else:
+            # Directly passing the chunk for summarization without recursive context
+            user_message_content = chunk
+
+        # Constructing messages based on whether recursive summarization is applied
+        messages = [
+            {"role": "system", "content": system_message_content},
+            {"role": "user", "content": user_message_content}
+        ]
+
+        # Assuming this function gets the completion and works as expected
+        response = get_chat_completion(messages, model=model)
+        accumulated_summaries.append(response)
+
+    # Compile final summary from partial summaries
+    final_summary = '\n\n'.join(accumulated_summaries)
+
+    return final_summary
+
+
+#######################################
+
+
+######### Words-per-second Chunking #########
+# FIXME - Whole section needs to be re-written
+def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]:
+    words = transcript.split()
+    words_per_chunk = chunk_duration * words_per_second
+    chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
+    return chunks
+
+
+def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
+                     words_per_second: int) -> str:
+    if api_name not in summarizers:  # See 'summarizers' dict in the main script
+        return f"Unsupported API: {api_name}"
+
+    if not transcript:
+        logging.error("Empty or None transcript provided to summarize_chunks")
+        return "Error: Empty or None transcript provided"
+
+    text = extract_text_from_segments(transcript)
+    chunks = chunk_transcript(text, chunk_duration, words_per_second)
+
+    custom_prompt = args.custom_prompt
+
+    summaries = []
+    for chunk in chunks:
+        if api_name == 'openai':
+            # Ensure the correct model and prompt are passed
+            summaries.append(summarize_with_openai(api_key, chunk, custom_prompt))
+        elif api_name == 'anthropic':
+            summaries.append(summarize_with_claude(api_key, chunk, anthropic_model, custom_prompt))
+        elif api_name == 'cohere':
+            summaries.append(summarize_with_cohere(api_key, chunk, cohere_model, custom_prompt))
+        elif api_name == 'groq':
+            summaries.append(summarize_with_groq(api_key, chunk, groq_model, custom_prompt))
+        elif api_name == 'llama':
+            summaries.append(summarize_with_llama(llama_api_IP, chunk, api_key, custom_prompt))
+        elif api_name == 'kobold':
+            summaries.append(summarize_with_kobold(kobold_api_IP, chunk, api_key, custom_prompt))
+        elif api_name == 'ooba':
+            summaries.append(summarize_with_oobabooga(ooba_api_IP, chunk, api_key, custom_prompt))
+        elif api_name == 'tabbyapi':
+            summaries.append(summarize_with_vllm(api_key, tabby_api_IP, chunk, summarize.llm_model, custom_prompt))
+        elif api_name == 'local-llm':
+            summaries.append(summarize_with_local_llm(chunk, custom_prompt))
+        else:
+            return f"Unsupported API: {api_name}"
+
+    return "\n\n".join(summaries)
+
+# FIXME - Whole section needs to be re-written
+def summarize_with_detail_openai(text, detail, verbose=False):
+    summary_with_detail_variable = rolling_summarize(text, detail=detail, verbose=verbose)
+    print(len(openai_tokenize(summary_with_detail_variable)))
+    return summary_with_detail_variable
+
+
+def summarize_with_detail_recursive_openai(text, detail, verbose=False):
+    summary_with_recursive_summarization = rolling_summarize(text, detail=detail, summarize_recursively=True)
+    print(summary_with_recursive_summarization)
+
+#
+#
+#################################################################################
+
+
+
+# Read configuration from file
+config = configparser.ConfigParser()
+config.read('../config.txt')
+
+# Local-Models
+kobold_api_IP = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
+kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')
+
+llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
+llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')
+
+ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
+ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')
+
+tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
+tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
+
+# Fallback assumes vLLM's default OpenAI-compatible server (port 8000, /v1 base path),
+# since summarize_with_vllm() uses this value as the OpenAI client's base_url
+vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:8000/v1')
+vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
+
+#######################################################################################################################
+# Function Definitions
+#
+
+def summarize_with_local_llm(file_path, custom_prompt_arg):
+    try:
+        logging.debug("Local LLM: Loading json data for summarization")
+        with open(file_path, 'r') as file:
+            segments = json.load(file)
+
+        logging.debug("Local LLM: Extracting text from the segments")
+        text = extract_text_from_segments(segments)
+
+        headers = {
+            'Content-Type': 'application/json'
+        }
+
+        logging.debug("Local LLM: Preparing data + prompt for submittal")
+        local_llm_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
+        data = {
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "You are a professional summarizer."
+                },
+                {
+                    "role": "user",
+                    "content": local_llm_prompt
+                }
+            ],
+            "max_tokens": 28000,  # Adjust tokens as needed
+        }
+        logging.debug("Local LLM: Posting request")
+        response = requests.post('http://127.0.0.1:8080/v1/chat/completions', headers=headers, json=data)
+
+        if response.status_code == 200:
+            response_data = response.json()
+            if 'choices' in response_data and len(response_data['choices']) > 0:
+                summary = response_data['choices'][0]['message']['content'].strip()
+                logging.debug("Local LLM: Summarization successful")
+                print("Local LLM: Summarization successful.")
+                return summary
+            else:
+                logging.warning("Local LLM: Summary not found in the response data")
+                return "Local LLM: Summary not available"
+        else:
+            logging.debug("Local LLM: Summarization failed")
+            print("Local LLM: Failed to process summary:", response.text)
+            return "Local LLM: Failed to process summary"
+    except Exception as e:
+        logging.debug("Local LLM: Error in processing: %s", str(e))
+        print("Error occurred while processing summary with Local LLM:", str(e))
+        return "Local LLM: Error occurred while processing summary"
+
+def summarize_with_llama(api_url, file_path, token, custom_prompt):
+    try:
+        logging.debug("llama: Loading JSON data")
+        with open(file_path, 'r') as file:
+            segments = json.load(file)
+
+        logging.debug("llama: Extracting text from segments file")
+        text = extract_text_from_segments(segments)  # Define this function to extract text properly
+
+        headers = {
+            'accept': 'application/json',
+            'content-type': 'application/json',
+        }
+        if len(token) > 5:
+            headers['Authorization'] = f'Bearer {token}'
+
+        llama_prompt = f"{text} \n\n\n\n{custom_prompt}"
+        logging.debug(f"llama: Prompt being sent is {llama_prompt}")
+
+        data = {
+            "prompt": llama_prompt
+        }
+
+        logging.debug("llama: Submitting request to API endpoint")
+        print("llama: Submitting request to API endpoint")
+        response = requests.post(api_url, headers=headers, json=data)
+
+        if response.status_code == 200:
+            # Only parse the body as JSON once we know the request succeeded
+            response_data = response.json()
+            logging.debug("API Response Data: %s", response_data)
+            summary = response_data['content'].strip()
+            logging.debug("llama: Summarization successful")
+            print("Summarization successful.")
+            return summary
+        else:
+            logging.error(f"llama: API request failed with status code {response.status_code}: {response.text}")
+            return f"llama: API request failed: {response.text}"
+
+    except Exception as e:
+        logging.error("llama: Error in processing: %s", str(e))
+        return f"llama: Error occurred while processing summary with llama: {str(e)}"
+
+
+# https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
+def summarize_with_kobold(api_url, file_path, kobold_api_token, custom_prompt):
+    try:
+        logging.debug("kobold: Loading JSON data")
+        with open(file_path, 'r') as file:
+            segments = json.load(file)
+
+        logging.debug("kobold: Extracting text from segments file")
+        text = extract_text_from_segments(segments)
+
+        headers = {
+            'accept': 'application/json',
+            'content-type': 'application/json',
+        }
+
+        kobold_prompt = f"{text} \n\n\n\n{custom_prompt}"
+        logging.debug(f"kobold: Prompt being sent is {kobold_prompt}")
+
+        # FIXME
+        # Values literally c/p from the api docs....
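+        # For reference, the koboldcpp /api/v1/generate endpoint (linked above) takes a
+        # JSON body whose main fields are "prompt", "max_context_length" and "max_length";
+        # a successful response carries the generation under results[0].text, which is
+        # exactly what the handler below unpacks.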
+        data = {
+            "max_context_length": 8096,
+            "max_length": 4096,
+            "prompt": kobold_prompt
+        }
+
+        logging.debug("kobold: Submitting request to API endpoint")
+        print("kobold: Submitting request to API endpoint")
+        response = requests.post(api_url, headers=headers, json=data)
+        response_data = response.json()
+        logging.debug("kobold: API Response Data: %s", response_data)
+
+        if response.status_code == 200:
+            if 'results' in response_data and len(response_data['results']) > 0:
+                summary = response_data['results'][0]['text'].strip()
+                logging.debug("kobold: Summarization successful")
+                print("Summarization successful.")
+                save_summary_to_file(summary, file_path)  # Save the summary to a file
+                return summary
+            else:
+                logging.error("Expected data not found in API response.")
+                return "Expected data not found in API response."
+        else:
+            logging.error(f"kobold: API request failed with status code {response.status_code}: {response.text}")
+            return f"kobold: API request failed: {response.text}"
+
+    except Exception as e:
+        logging.error("kobold: Error in processing: %s", str(e))
+        return f"kobold: Error occurred while processing summary with kobold: {str(e)}"
+
+
+# https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
+def summarize_with_oobabooga(api_url, file_path, ooba_api_token, custom_prompt):
+    try:
+        logging.debug("ooba: Loading JSON data")
+        with open(file_path, 'r') as file:
+            segments = json.load(file)
+
+        logging.debug("ooba: Extracting text from segments file")
+        text = extract_text_from_segments(segments)
+        logging.debug("ooba: Finished extracting text from segments file")
+
+        headers = {
+            'accept': 'application/json',
+            'content-type': 'application/json',
+        }
+
+        # prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a French bakery baking cakes. It
+        # is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are
+        # my favorite." prompt_text += f"\n\n{text}"  # Uncomment this line if you want to include the text variable
+        ooba_prompt = f"{text}\n\n\n\n{custom_prompt}"
+        logging.debug(f"ooba: Prompt being sent is {ooba_prompt}")
+
+        data = {
+            "mode": "chat",
+            "character": "Example",
+            "messages": [{"role": "user", "content": ooba_prompt}]
+        }
+
+        logging.debug("ooba: Submitting request to API endpoint")
+        print("ooba: Submitting request to API endpoint")
+        response = requests.post(api_url, headers=headers, json=data, verify=False)
+        logging.debug("ooba: API Response Data: %s", response)
+
+        if response.status_code == 200:
+            response_data = response.json()
+            summary = response_data['choices'][0]['message']['content']
+            logging.debug("ooba: Summarization successful")
+            print("Summarization successful.")
+            return summary
+        else:
+            logging.error(f"oobabooga: API request failed with status code {response.status_code}: {response.text}")
+            return f"ooba: API request failed with status code {response.status_code}: {response.text}"
+
+    except Exception as e:
+        logging.error("ooba: Error in processing: %s", str(e))
+        return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"
+
+
+# FIXME - https://docs.vllm.ai/en/latest/getting_started/quickstart.html .... Great docs.
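+# Sketch of the assumed setup for the function below: a locally running vLLM server
+# exposing an OpenAI-compatible API (per the vLLM quickstart linked above), e.g.:
+#
+#   python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.2
+#
+# which serves /v1/chat/completions on port 8000 by default; the OpenAI client is then
+# pointed at that server via base_url. The model name here is illustrative, not a
+# project default.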
+def summarize_with_vllm(vllm_api_url, vllm_api_key_function_arg, llm_model, text, vllm_custom_prompt_function_arg):
+    vllm_client = OpenAI(
+        base_url=vllm_api_url,
+        api_key=vllm_api_key_function_arg
+    )
+
+    custom_prompt = vllm_custom_prompt_function_arg
+
+    # Use the vLLM client created above, not the module-level OpenAI client
+    completion = vllm_client.chat.completions.create(
+        model=llm_model,
+        messages=[
+            {"role": "system", "content": "You are a professional summarizer."},
+            {"role": "user", "content": f"{text} \n\n\n\n{custom_prompt}"}
+        ]
+    )
+    vllm_summary = completion.choices[0].message.content
+    return vllm_summary
+
+
+# FIXME - Install is more trouble than care to deal with right now.
+def summarize_with_tabbyapi(tabby_api_key, tabby_api_IP, text, tabby_model, custom_prompt):
+    headers = {
+        'Authorization': f'Bearer {tabby_api_key}',
+        'Content-Type': 'application/json'
+    }
+    data = {
+        'text': text,
+        'model': tabby_model  # Specify the model if needed
+    }
+    try:
+        response = requests.post('https://api.tabbyapi.com/summarize', headers=headers, json=data)
+        response.raise_for_status()
+        summary = response.json().get('summary', '')
+        return summary
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error summarizing with TabbyAPI: {e}")
+        return "Error summarizing with TabbyAPI."
+
+
+def save_summary_to_file(summary, file_path):
+    logging.debug("Now saving summary to file...")
+    base_name = os.path.splitext(os.path.basename(file_path))[0]
+    summary_file_path = os.path.join(os.path.dirname(file_path), base_name + '_summary.txt')
+    os.makedirs(os.path.dirname(summary_file_path), exist_ok=True)
+    logging.debug("Opening summary file for writing: replacing *segments.json with *_summary.txt")
+    with open(summary_file_path, 'w') as file:
+        file.write(summary)
+    logging.info(f"Summary saved to file: {summary_file_path}")
+
+# From Video_DL_Ingestion_Lib.py
+# def save_summary_to_file(summary: str, file_path: str):
+#     """Save summary to a JSON file."""
+#     summary_data = {'summary': summary, 'generated_at': datetime.now().isoformat()}
+#     with open(file_path, 'w') as file:
+#         json.dump(summary_data, file, indent=4)
+
+
+#
+#
+#######################################################################################################################
+
+
+
+
+
+
+#######################################################################################################################
+# Function Definitions
+#
+
+# Download latest llamafile from Github
+    # Example usage
+    #repo = "Mozilla-Ocho/llamafile"
+    #asset_name_prefix = "llamafile-"
+    #output_filename = "llamafile"
+    #download_latest_llamafile(repo, asset_name_prefix, output_filename)
+def download_latest_llamafile(repo, asset_name_prefix, output_filename):
+    # Check if the file already exists
+    print("Checking for and downloading Llamafile if it doesn't already exist...")
+    if os.path.exists(output_filename):
+        print("Llamafile already exists. Skipping download.")
+        logging.debug(f"{output_filename} already exists. Skipping download.")
+        llamafile_exists = True
+    else:
+        llamafile_exists = False
+
+    if not llamafile_exists:
+        # Get the latest release information
+        latest_release_url = f"https://api.github.com/repos/{repo}/releases/latest"
+        response = requests.get(latest_release_url)
+        if response.status_code != 200:
+            raise Exception(f"Failed to fetch latest release info: {response.status_code}")
+
+        latest_release_data = response.json()
+        tag_name = latest_release_data['tag_name']
+
+        # Get the release details using the tag name
+        release_details_url = f"https://api.github.com/repos/{repo}/releases/tags/{tag_name}"
+        response = requests.get(release_details_url)
+        if response.status_code != 200:
+            raise Exception(f"Failed to fetch release details for tag {tag_name}: {response.status_code}")
+
+        release_data = response.json()
+        assets = release_data.get('assets', [])
+
+        # Find the asset with the specified prefix
+        asset_url = None
+        for asset in assets:
+            if re.match(f"{asset_name_prefix}.*", asset['name']):
+                asset_url = asset['browser_download_url']
+                break
+
+        if not asset_url:
+            raise Exception(f"No asset found with prefix {asset_name_prefix}")
+
+        # Download the asset
+        response = requests.get(asset_url)
+        if response.status_code != 200:
+            raise Exception(f"Failed to download asset: {response.status_code}")
+
+        print("Llamafile downloaded successfully.")
+        logging.debug("Main: Llamafile downloaded successfully.")
+
+        # Save the file
+        with open(output_filename, 'wb') as file:
+            file.write(response.content)
+
+        logging.debug(f"Downloaded {output_filename} from {asset_url}")
+        print(f"Downloaded {output_filename} from {asset_url}")
+
+    # Check to see if the LLM already exists, and if not, download the LLM
+    print("Checking for and downloading LLM from Huggingface if needed...")
+    logging.debug("Main: Checking and downloading LLM from Huggingface if needed...")
+    mistral_7b_instruct_v0_2_q8_0_llamafile = "mistral-7b-instruct-v0.2.Q8_0.llamafile"
+    Samantha_Mistral_Instruct_7B_Bulleted_Notes_Q8 = "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
+    Phi_3_mini_128k_instruct_Q8_0_gguf = "Phi-3-mini-128k-instruct-Q8_0.gguf"
+    if os.path.exists(mistral_7b_instruct_v0_2_q8_0_llamafile):
+        llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
+        print("Model is already downloaded. Skipping download.")
+    elif os.path.exists(Samantha_Mistral_Instruct_7B_Bulleted_Notes_Q8):
+        llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
+        print("Model is already downloaded. Skipping download.")
+    else:
+        logging.debug("Main: Checking and downloading LLM from Huggingface if needed...")
+        print("Downloading LLM from Huggingface...")
+        time.sleep(1)
+        print("Gonna be a bit...")
+        time.sleep(1)
+        print("Like seriously, an 8GB file...")
+        time.sleep(2)
+        dl_check = input("Final chance to back out, hit 'N'/'n' to cancel, or 'Y'/'y' to continue: ")
+        if dl_check == "N" or dl_check == "n":
+            exit()
+        else:
+            print("Downloading LLM from Huggingface...")
+            # Establish hash values for LLM models
+            mistral_7b_instruct_v0_2_q8_gguf_sha256 = "f326f5f4f137f3ad30f8c9cc21d4d39e54476583e8306ee2931d5a022cb85b06"
+            samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256 = "6334c1ab56c565afd86535271fab52b03e67a5e31376946bce7bf5c144e847e4"
+            mistral_7b_instruct_v0_2_q8_0_llamafile_sha256 = "1ee6114517d2f770425c880e5abc443da36b193c82abec8e2885dd7ce3b9bfa6"
+            global llm_choice
+
+            # FIXME - llm_choice
+            llm_choice = input("Which LLM model would you like to download? 1. Mistral-7B-Instruct-v0.2-GGUF or 2. Samantha-Mistral-Instruct-7B-Bulleted-Notes (plain or 'custom') or MS Flavor: 3. Phi-3-mini-128k-instruct-Q8_0.gguf \n\n\tPress '1', '2' or '3' to specify: ")
+            while llm_choice not in ("1", "2", "3"):
+                # Re-prompt instead of looping forever on an invalid choice
+                llm_choice = input("Invalid choice. Please enter '1', '2' or '3': ")
+            if llm_choice == "1":
+                llm_download_model = "Mistral-7B-Instruct-v0.2-Q8.llamafile"
+                llm_download_model_hash = mistral_7b_instruct_v0_2_q8_0_llamafile_sha256
+                llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
+                llamafile_llm_output_filename = "mistral-7b-instruct-v0.2.Q8_0.llamafile"
+                download_file(llamafile_llm_url, llamafile_llm_output_filename, llm_download_model_hash)
+            elif llm_choice == "2":
+                llm_download_model = "Samantha-Mistral-Instruct-7B-Bulleted-Notes-Q8.gguf"
+                llm_download_model_hash = samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256
+                llamafile_llm_output_filename = "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
+                llamafile_llm_url = "https://huggingface.co/cognitivetech/samantha-mistral-instruct-7b-bulleted-notes-GGUF/resolve/main/samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf?download=true"
+                download_file(llamafile_llm_url, llamafile_llm_output_filename, llm_download_model_hash)
+            elif llm_choice == "3":
+                llm_download_model = "Phi-3-mini-128k-instruct-Q8_0.gguf"
+                Phi_3_mini_128k_instruct_Q8_0_gguf_sha256 = "6817b66d1c3c59ab06822e9732f0e594eea44e64cae2110906eac9d17f75d193"
+                llm_download_model_hash = Phi_3_mini_128k_instruct_Q8_0_gguf_sha256
+                llamafile_llm_output_filename = "Phi-3-mini-128k-instruct-Q8_0.gguf"
+                llamafile_llm_url = "https://huggingface.co/gaianet/Phi-3-mini-128k-instruct-GGUF/resolve/main/Phi-3-mini-128k-instruct-Q8_0.gguf?download=true"
+                download_file(llamafile_llm_url, llamafile_llm_output_filename, llm_download_model_hash)
+            elif llm_choice == "4":  # FIXME - unreachable until '4' is offered in the prompt above; download call also missing
+                meta_Llama_3_8B_Instruct_Q8_0_llamafile_sha256 = "406868a97f02f57183716c7e4441d427f223fdbc7fa42964ef10c4d60dd8ed37"
+                llm_download_model_hash = meta_Llama_3_8B_Instruct_Q8_0_llamafile_sha256
+                llamafile_llm_output_filename = "Meta-Llama-3-8B-Instruct.Q8_0.llamafile"
+                llamafile_llm_url = "https://huggingface.co/Mozilla/Meta-Llama-3-8B-Instruct-llamafile/resolve/main/Meta-Llama-3-8B-Instruct.Q8_0.llamafile?download=true"
+            else:
+                print("Invalid choice. Please try again.")
+    return output_filename
+
+
+def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
+    temp_path = dest_path + '.tmp'
+
+    for attempt in range(max_retries):
+        try:
+            # Check if a partial download exists and get its size
+            resume_header = {}
+            if os.path.exists(temp_path):
+                resume_header = {'Range': f'bytes={os.path.getsize(temp_path)}-'}
+
+            response = requests.get(url, stream=True, headers=resume_header)
+            response.raise_for_status()
+
+            # Get the total file size from headers
+            total_size = int(response.headers.get('content-length', 0))
+            initial_pos = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
+
+            # Append only if the server honoured the Range request (206 Partial Content);
+            # 'Range' is a request header, so checking response.headers for it was a no-op
+            mode = 'ab' if response.status_code == 206 else 'wb'
+            with open(temp_path, mode) as temp_file, tqdm(
+                    total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
+            ) as pbar:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:  # filter out keep-alive new chunks
+                        temp_file.write(chunk)
+                        pbar.update(len(chunk))
+
+            # Verify the checksum if provided
+            if expected_checksum:
+                if not verify_checksum(temp_path, expected_checksum):
+                    os.remove(temp_path)
+                    raise ValueError("Downloaded file's checksum does not match the expected checksum")
+
+            # Move the file to the final destination
+            os.rename(temp_path, dest_path)
+            print("Download complete and verified!")
+            return dest_path
+
+        except Exception as e:
+            print(f"Attempt {attempt + 1} failed: {e}")
+            if attempt < max_retries - 1:
+                print(f"Retrying in {delay} seconds...")
+                time.sleep(delay)
+            else:
+                print("Max retries reached. Download failed.")
+                raise
+
+# FIXME / IMPLEMENT FULLY
+# File download verification
+#mistral_7b_llamafile_instruct_v02_q8_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
+#global mistral_7b_instruct_v0_2_q8_0_llamafile_sha256
+#mistral_7b_instruct_v0_2_q8_0_llamafile_sha256 = "1ee6114517d2f770425c880e5abc443da36b193c82abec8e2885dd7ce3b9bfa6"
+
+#mistral_7b_v02_instruct_model_q8_gguf_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q8_0.gguf?download=true"
+#global mistral_7b_instruct_v0_2_q8_gguf_sha256
+#mistral_7b_instruct_v0_2_q8_gguf_sha256 = "f326f5f4f137f3ad30f8c9cc21d4d39e54476583e8306ee2931d5a022cb85b06"
+
+#samantha_instruct_model_q8_gguf_url = "https://huggingface.co/cognitivetech/samantha-mistral-instruct-7b_bulleted-notes_GGUF/resolve/main/samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf?download=true"
+#global samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256
+#samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256 = "6334c1ab56c565afd86535271fab52b03e67a5e31376946bce7bf5c144e847e4"
+
+
+
+def verify_checksum(file_path, expected_checksum):
+    sha256_hash = hashlib.sha256()
+    with open(file_path, 'rb') as f:
+        for byte_block in iter(lambda: f.read(4096), b''):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest() == expected_checksum
+
+process = None
+# Function to close out llamafile process on script exit.
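+# A sketch of the intended wiring (an assumption — only the atexit registration is
+# visible in this section, inside local_llm_function below):
+#
+#   import atexit, signal
+#   atexit.register(cleanup_process)
+#   signal.signal(signal.SIGINT, signal_handler)
+#   signal.signal(signal.SIGTERM, signal_handler)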
+def cleanup_process():
+    global process
+    if process is not None:
+        process.kill()
+        logging.debug("Main: Terminated the external process")
+
+
+def signal_handler(sig, frame):
+    logging.info('Signal handler called with signal: %s', sig)
+    cleanup_process()
+    sys.exit(0)
+
+
+# FIXME - Add callout to gradio UI
+def local_llm_function():
+    global process
+    repo = "Mozilla-Ocho/llamafile"
+    asset_name_prefix = "llamafile-"
+    useros = os.name
+    if useros == "nt":
+        output_filename = "llamafile.exe"
+    else:
+        output_filename = "llamafile"
+    print(
+        "WARNING - Checking for existence of llamafile and HuggingFace model, downloading if needed...This could be a while")
+    print("WARNING - and I mean a while. We're talking an 8 Gigabyte model here...")
+    print("WARNING - Hope you're comfy. Or it's already downloaded.")
+    time.sleep(6)
+    logging.debug("Main: Checking and downloading Llamafile from Github if needed...")
+    llamafile_path = download_latest_llamafile(repo, asset_name_prefix, output_filename)
+    logging.debug("Main: Llamafile downloaded successfully.")
+
+    # FIXME - llm_choice
+    global llm_choice
+    llm_choice = 1
+    # Launch the llamafile in an external process with the specified arguments
+    # (no stray whitespace in the argument strings, or the llamafile CLI chokes on them)
+    if llm_choice == 1:
+        arguments = ["--ctx-size", "8192", "-m", "mistral-7b-instruct-v0.2.Q8_0.llamafile"]
+    elif llm_choice == 2:
+        arguments = ["--ctx-size", "8192", "-m", "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"]
+    elif llm_choice == 3:
+        arguments = ["--ctx-size", "8192", "-m", "Phi-3-mini-128k-instruct-Q8_0.gguf"]
+    elif llm_choice == 4:
+        arguments = ["--ctx-size", "8192", "-m", "llama-3"]  # FIXME
+
+    try:
+        logging.info("Main: Launching the LLM (llamafile) in an external terminal window...")
+        if useros == "nt":
+            launch_in_new_terminal_windows(llamafile_path, arguments)
+        elif useros == "posix":
+            launch_in_new_terminal_linux(llamafile_path, arguments)
+        else:
+            launch_in_new_terminal_mac(llamafile_path, arguments)
+        # FIXME - pid doesn't exist in this context
+        #logging.info(f"Main: Launched the {llamafile_path} with PID {process.pid}")
+        # cleanup_process takes no arguments, so register it without any
+        atexit.register(cleanup_process)
+    except Exception as e:
+        logging.error(f"Failed to launch the process: {e}")
+        print(f"Failed to launch the process: {e}")
+
+
+def local_llm_gui_function(prompt, temperature, top_k, top_p, min_p, stream, stop, typical_p, repeat_penalty, repeat_last_n,
+                           penalize_nl, presence_penalty, frequency_penalty, penalty_prompt, ignore_eos, system_prompt):
+    repo = "Mozilla-Ocho/llamafile"
+    asset_name_prefix = "llamafile-"
+    useros = os.name
+    if useros == "nt":
+        output_filename = "llamafile.exe"
+    else:
+        output_filename = "llamafile"
+    print(
+        "WARNING - Checking for existence of llamafile and HuggingFace model, downloading if needed...This could be a while")
+    print("WARNING - and I mean a while. We're talking an 8 Gigabyte model here...")
+    print("WARNING - Hope you're comfy. Or it's already downloaded.")
+    time.sleep(6)
+    logging.debug("Main: Checking and downloading Llamafile from Github if needed...")
+    llamafile_path = download_latest_llamafile(repo, asset_name_prefix, output_filename)
+    logging.debug("Main: Llamafile downloaded successfully.")
+
+    # FIXME - llm_choice
+    global llm_choice
+    llm_choice = 1
+    # Launch the llamafile in an external process with the specified arguments
+    if llm_choice == 1:
+        arguments = ["--ctx-size", "8192", "-m", "mistral-7b-instruct-v0.2.Q8_0.llamafile"]
+    elif llm_choice == 2:
+        arguments = ["--ctx-size", "8192", "-m", "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"]
+    elif llm_choice == 3:
+        arguments = ["--ctx-size", "8192", "-m", "Phi-3-mini-128k-instruct-Q8_0.gguf"]
+    elif llm_choice == 4:
+        arguments = ["--ctx-size", "8192", "-m", "llama-3"]  # FIXME
+
+    try:
+        logging.info("Main: Launching the LLM (llamafile) in an external terminal window...")
+        if useros == "nt":
+            launch_in_new_terminal_windows(llamafile_path, arguments)
+        elif useros == "posix":
+            launch_in_new_terminal_linux(llamafile_path, arguments)
+        else:
+            launch_in_new_terminal_mac(llamafile_path, arguments)
+        # FIXME - pid doesn't exist in this context
+        #logging.info(f"Main: Launched the {llamafile_path} with PID {process.pid}")
+        atexit.register(cleanup_process)
+    except Exception as e:
+        logging.error(f"Failed to launch the process: {e}")
+        print(f"Failed to launch the process: {e}")
+
+
+
+
+# Launch the executable in a new terminal window  # FIXME - really should figure out a cleaner way of doing this...
+def launch_in_new_terminal_windows(executable, args):
+    command = f'start cmd /k "{executable} {" ".join(args)}"'
+    subprocess.Popen(command, shell=True)
+
+
+# FIXME
+def launch_in_new_terminal_linux(executable, args):
+    command = f'gnome-terminal -- {executable} {" ".join(args)}'
+    subprocess.Popen(command, shell=True)
+
+
+# FIXME
+def launch_in_new_terminal_mac(executable, args):
+    command = f'open -a Terminal.app {executable} {" ".join(args)}'
+    subprocess.Popen(command, shell=True)
+
+
+#######################################################################################################################
+# Function Definitions
+#
+
+def read_paths_from_file(file_path: str) -> List[str]:
+    """Reads a file containing URLs or local file paths and returns them as a list."""
+    with open(file_path, 'r') as file:
+        return [line.strip() for line in file]
+
+
+def process_path(path):
+    """Decides whether the path is a URL or a local file and processes accordingly."""
+    if path.startswith('http'):
+        logging.debug("file is a URL")
+        # For YouTube URLs, modify to download and extract info
+        return get_youtube(path)
+    elif os.path.exists(path):
+        logging.debug("File is a path")
+        # For local files, define a function to handle them
+        return process_local_file(path)
+    else:
+        logging.error(f"Path does not exist: {path}")
+        return None
+
+
+# FIXME
+def process_local_file(file_path):
+    logging.info(f"Processing local file: {file_path}")
+    title = normalize_title(os.path.splitext(os.path.basename(file_path))[0])
+    info_dict = {'title': title}
+    logging.debug(f"Creating {title} directory...")
+    download_path = create_download_directory(title)
+    logging.debug(f"Converting '{title}' to an audio file (wav).")
+    audio_file = convert_to_wav(file_path)  # Assumes input files are videos needing audio extraction
+    logging.debug(f"'{title}' successfully converted to an audio file (wav).")
+    return download_path, info_dict, audio_file
+
+
+#
+#
+#######################################################################################################################
 #
 #
 #######################################################################################################################