File size: 2,831 Bytes
42fa84c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from llama_index.core import StorageContext
from llama_index.embeddings.gemini import GeminiEmbedding
import google.generativeai as genai

from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone


from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.ingestion import IngestionPipeline

from research_assistant_app.constants import gemini_api_key, pinecone_api_key
from research_assistant_app.components.data_ingestion import get_cleaned_dir_docs


from research_assistant_app.constants import gemini_api_key, pinecone_api_key
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
import google.generativeai as genai
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter


# Configure the Google Generative AI SDK with the project's API key before any
# Gemini client is constructed.
genai.configure(api_key=gemini_api_key)

# The model is selected with `model_name` (there is no `models` parameter on
# the Gemini constructor), and the identifier carries the "models/" prefix,
# matching the embedding model below.
model = Gemini(
    model_name="models/gemini-pro",
    api_key=gemini_api_key,
    temperature=0.3,
)
gemini_embed_model = GeminiEmbedding(model_name="models/embedding-001")

# Short alias used by the ingestion pipeline defined further down.
embed_model = gemini_embed_model

# Process-wide llama_index defaults: LLM, embedding model, chunking and
# context/output sizes.
Settings.llm = model
Settings.embed_model = gemini_embed_model
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900


# Handle to the Pinecone index that receives the embedded nodes.
pc = Pinecone(api_key=pinecone_api_key)
pinecone_index = pc.Index(
    "ai-research-assistant"
)  # `ai-research-assistant` is the index name

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Ingestion pipeline: split documents with the semantic splitter, embed the
# resulting nodes, and write them to the Pinecone-backed vector store.
# It is built once, directly with its vector store, rather than first building
# a store-less pipeline that is immediately replaced.
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95,
            embed_model=embed_model,
        ),
        embed_model,
    ],
    vector_store=vector_store,
)


# Now we run our pipeline!
def run_indexing_pipeline(docs):
    """Run the module-level ingestion pipeline on ``docs`` and report index stats.

    Args:
        docs: Documents passed unchanged to ``pipeline.run(documents=...)``.

    Returns:
        The result of ``pinecone_index.describe_index_stats()`` taken after
        the pipeline run.
    """
    # Re-apply the Gemini API key right before the run.
    # NOTE(review): presumably guards against the SDK configuration having
    # changed since import time -- confirm before removing.
    genai.configure(api_key=gemini_api_key)
    pipeline.run(documents=docs)

    stats = pinecone_index.describe_index_stats()
    return stats


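# Example of the stats returned by run_indexing_pipeline (illustrative values;
# the real dimension and vector counts depend on the Pinecone index):
# >>> {'dimension': 1536,
# >>> 'index_fullness': 0.0,
# >>> 'namespaces': {'': {'vector_count': 46}},
# >>> 'total_vector_count': 46}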

if __name__ == "__main__":
    # Script entry point: load the cleaned documents from the "Data" directory
    # and index only the first three of them, then print the index stats.
    sample_docs = get_cleaned_dir_docs("Data")[:3]
    print(run_indexing_pipeline(sample_docs), "pincone index")