Delete whalecore
- whalecore/__init__.py +0 -0
- whalecore/agents.py +0 -36
- whalecore/parser.py +0 -49
- whalecore/rag.py +0 -37
whalecore/__init__.py
DELETED
File without changes
whalecore/agents.py
DELETED
@@ -1,36 +0,0 @@
import yaml

class Agent:
    def __init__(self, name, persona, instructions):
        self.name = name
        self.persona = persona
        self.instructions = instructions

    def chat(self, message):
        # Placeholder logic — replace with real LLM call later
        return f"🧠 {self.name} says:\n{self.instructions}\n\n{self.persona}\n\nYou said: {message[:260]}..."

def load_agents(config_path="config.yaml"):
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    assert isinstance(config, dict), "YAML must contain a top-level 'agents:' key"
    assert 'agents' in config, "Missing 'agents' key in YAML file"

    print("🧠 YAML loaded successfully:", config)

    agents = []
    for agent_conf in config['agents']:
        agent = Agent(
            name=agent_conf['name'],
            persona=agent_conf['persona'],
            instructions=agent_conf['instructions']
        )
        agents.append(agent)
    return agents

def run_agents_on_text(agent_list, text):
    results = {}
    for agent in agent_list:
        results[agent.name] = agent.chat(text)
    return results
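For context on how the deleted agents module fit together, here is a minimal, hedged usage sketch. It assumes a config.yaml whose top-level 'agents:' list carries name, persona and instructions keys, which is what load_agents() reads; the agent values below are illustrative placeholders, not taken from the Space.

import yaml
from whalecore.agents import load_agents, run_agents_on_text

# Illustrative config matching the keys load_agents() expects.
sample_config = {
    "agents": [
        {
            "name": "Summarizer",
            "persona": "You are a concise analyst.",
            "instructions": "Summarize the input in three bullet points.",
        }
    ]
}

with open("config.yaml", "w") as f:
    yaml.safe_dump(sample_config, f)

agents = load_agents("config.yaml")
replies = run_agents_on_text(agents, "Whales are the largest animals on Earth.")
for name, reply in replies.items():
    print(name, "->", reply)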
whalecore/parser.py
DELETED
@@ -1,49 +0,0 @@
import os
import PyPDF2
import whisper
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="`clean_up_tokenization_spaces` was not set.*"
)
model = SentenceTransformer('all-MiniLM-L6-v2')

def parse_pdf(filepath):
    text = ""
    with open(filepath, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

def parse_audio(filepath):
    model = whisper.load_model("base")
    result = model.transcribe(filepath)
    return result['text']

def parse_text(filepath):
    with open(filepath, 'r') as f:
        return f.read()

def parse_file(filepath):
    if filepath.endswith('.pdf'):
        return parse_pdf(filepath)
    elif filepath.endswith(('.mp3', '.wav', '.m4a')):
        return parse_audio(filepath)
    elif filepath.endswith('.txt'):
        return parse_text(filepath)
    else:
        raise ValueError(f"Unsupported file type: {filepath}")

def chunk_text(text, chunk_size=300):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

def chunk_and_embed(text):
    chunks = chunk_text(text)
    embeddings = model.encode(chunks).tolist()
    return list(zip(chunks, embeddings))
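A short, hedged sketch of how the deleted parser module was presumably used. It assumes the dependencies it imports (PyPDF2, openai-whisper, pydub, sentence-transformers) are installed; the file name notes.txt is a placeholder, and parse_file() dispatches purely on the file extension.

from whalecore.parser import parse_file, chunk_and_embed

# notes.txt is a placeholder; .pdf and .mp3/.wav/.m4a paths route to the PDF and audio parsers.
text = parse_file("notes.txt")
pairs = chunk_and_embed(text)  # list of (chunk, embedding) tuples; all-MiniLM-L6-v2 produces 384-dim vectors
print(f"{len(pairs)} chunks embedded, first vector has {len(pairs[0][1])} dimensions")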
whalecore/rag.py
DELETED
@@ -1,37 +0,0 @@
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')
client = MongoClient()
db = client['huggingwhale']
collection = db['docs']

def chunk_text(text, chunk_size=300):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

def embed_chunks(chunks):
    return model.encode(chunks).tolist()

def store_embeddings(chunks, embeddings):
    docs = [
        {"chunk": chunk, "embedding": emb}
        for chunk, emb in zip(chunks, embeddings)
    ]
    collection.insert_many(docs)

def query_rag(question, top_k=3):
    question_vec = model.encode([question])[0]
    results = collection.aggregate([
        {
            "$vectorSearch": {
                "index": "default",
                "path": "embedding",
                "queryVector": question_vec,
                "numCandidates": 100,
                "limit": top_k
            }
        }
    ])
    return [doc['chunk'] for doc in results]
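Two caveats about the deleted rag module, plus a hedged usage sketch. The $vectorSearch aggregation stage only runs against MongoDB Atlas collections with a vector search index (here named "default" on the "embedding" path), not against a bare local mongod, and queryVector must be BSON-serializable, so the NumPy array returned by model.encode() typically needs a .tolist() conversion before the query is sent. The document text and question below are illustrative placeholders.

from whalecore.rag import chunk_text, embed_chunks, store_embeddings, query_rag

# Illustrative document; store_embeddings() writes {"chunk", "embedding"} docs into huggingwhale.docs.
text = "Blue whales can reach about 30 metres in length and feed mainly on krill."
chunks = chunk_text(text)
store_embeddings(chunks, embed_chunks(chunks))

# Requires an Atlas vector search index named "default" on the "embedding" field.
print(query_rag("How large do blue whales get?", top_k=3))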