Namitg02 committed
Commit 370332e · verified · 1 Parent(s): c984729

Update app.py

Files changed (1)
  1. app.py +9 -65
app.py CHANGED
@@ -13,73 +13,17 @@ from transformers import AutoTokenizer
  from transformers import AutoModelForCausalLM
  from transformers import TextIteratorStreamer
  from threading import Thread
- from torchtext.data import to_map_style_dataset
 
  llm_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
  tokenizer = AutoTokenizer.from_pretrained(llm_model)
  # pulling tokenizer for text generation model
 
- #import numpy as np
-
- datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
-
- def is_iterable_dataset(datasetiter):
-     return isinstance(datasetiter, torch.utils.data.IterableDataset)
-
- def is_map_style_dataset(datasetiter):
-     return isinstance(datasetiter, torch.utils.data.Dataset)
-
- if is_iterable_dataset(datasetiter):
-     print("The datasetiter dataset is iterable-style.")
- else:
-     print("The datasetiter dataset is map-style.")
-
-
-
- from torch.utils.data import Dataset, IterableDataset
-
- class MyIterableDataset(IterableDataset):
-     def __init__(self, iterable):
-         super().__init__()
-         self.iterable = iterable
-
-     def __iter__(self):
-         return iter(self.iterable)
-
- class MapStyleDataset(Dataset):
-     def __init__(self, iterable):
-         super().__init__()
-         self.data = list(iterable)
-
-     def __len__(self):
-         return len(self.data)
-
-     def __getitem__(self, idx):
-         return self.data[idx]
-
-
- # Create an iterable
- #iterable = "Namitg02/Test"
-
- # Convert the iterable to a MapStyle dataset
- map_style_dataset = MapStyleDataset(iterable)
-
- # Create a DataLoader for the MapStyle dataset
- data_loader = torch.utils.data.DataLoader(map_style_dataset, batch_size=2)
-
-
-
-
-
- #datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
- #dataset = to_map_style_dataset(datasetiter)
-
-
+ dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
  #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
  #dataset = load_dataset("epfl-llm/guidelines", split='train')
  #Returns a list of dictionaries, each representing a row in the dataset.
- print(map_style_dataset[1])
- length = len(map_style_dataset)
+ print(dataset[1])
+ length = len(dataset)
 
  #Itemdetails = dataset.items()
  #print(Itemdetails)
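Note on the removed block above: `map_style_dataset = MapStyleDataset(iterable)` referenced `iterable` even though its assignment (`#iterable = "Namitg02/Test"`) was commented out, so the deleted code would have raised a NameError, and a non-streaming `load_dataset(...)` split is already map-style (it supports `len()` and integer indexing), which is presumably why the commit drops the wrapper entirely. For reference, a minimal self-contained sketch of the wrapper pattern, using a hypothetical in-memory list in place of the dataset:

from torch.utils.data import DataLoader, Dataset

class MapStyleDataset(Dataset):
    # Wraps any finite iterable as a map-style dataset.
    def __init__(self, iterable):
        super().__init__()
        self.data = list(iterable)  # materialize so __getitem__ can index

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

iterable = ["row one", "row two", "row three"]  # hypothetical stand-in data
map_style_dataset = MapStyleDataset(iterable)
data_loader = DataLoader(map_style_dataset, batch_size=2)
print(next(iter(data_loader)))  # ['row one', 'row two']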
@@ -91,18 +35,18 @@ embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
  #doc_func = lambda x: x.text
  #dataset = list(map(doc_func, dataset))
 
- def embedder(map_style_dataset):
-     embeddings = embedding_model.encode(map_style_dataset["text"])
-     map_style_dataset = map_style_dataset.add_column('embeddings', embeddings)
-     return map_style_dataset
- updated_dataset = map_style_dataset.map(embedder)
+ def embedder(dataset):
+     embeddings = embedding_model.encode(dataset["text"])
+     dataset = dataset.add_column('embeddings', embeddings)
+     return dataset
+ updated_dataset = dataset.map(embedder)
  dataset['text'][:length]
 
  #print(embeddings)
 
  print(updated_dataset[1])
  print(updated_dataset[2])
- print(map_style_dataset[1])
+ print(dataset[1])
 
  embedding_dim = embedding_model.get_sentence_embedding_dimension()
  #data = FAISS.from_embeddings(embed, embedding_model)
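Note on the rewritten `embedder`: `Dataset.map` passes each example to the function as a plain dict, not as the `Dataset` object, so the `add_column` call inside it would raise AttributeError at runtime. The conventional fix is a batched map that returns the new column as a key; a sketch assuming the same model and dataset as in this diff:

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
dataset = load_dataset("Namitg02/Test", split="train")

def embedder(batch):
    # batch["text"] is a list of strings; the returned key becomes a new column
    return {"embeddings": embedding_model.encode(batch["text"])}

updated_dataset = dataset.map(embedder, batched=True, batch_size=32)
print(updated_dataset[1])  # each row now carries an "embeddings" vector

The stray `dataset['text'][:length]` context line is a no-op expression: it slices the column and discards the result.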
 
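The commented-out `FAISS.from_embeddings` line suggests the eventual goal is nearest-neighbour retrieval over the embeddings. If so, `datasets` can attach a FAISS index directly to the column; a sketch continuing from the batched-map example above (assumes the `faiss-cpu` package is installed):

# Build a flat FAISS index over the "embeddings" column.
updated_dataset.add_faiss_index(column="embeddings")

# Retrieve the 3 rows closest to a query embedding.
query = embedding_model.encode("an example question")
scores, examples = updated_dataset.get_nearest_examples("embeddings", query, k=3)
print(examples["text"])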