ccm committed on
Commit
b68bdb3
·
verified ·
1 Parent(s): 76004d8

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +6 -0
main.py CHANGED
@@ -24,6 +24,12 @@ LLM_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
24
 
25
  # Load the dataset and convert to pandas
26
  data = pandas.read_parquet("hf://datasets/ccm/rag-idetc/data/train-00000-of-00001.parquet")
 
 
 
 
 
 
27
 
28
  # Load the model for later use in embeddings
29
  model = sentence_transformers.SentenceTransformer(EMBEDDING_MODEL_NAME)
 
24
 
25
  # Load the dataset and convert to pandas
26
  data = pandas.read_parquet("hf://datasets/ccm/rag-idetc/data/train-00000-of-00001.parquet")
27
+ # Filter out any publications without an abstract
28
+ title_is_empty = [
29
+ title is "" for title in data["title"].values
30
+ ]
31
+ data = data[~pandas.Series(title_is_empty)]
32
+ data.reset_index(inplace=True)\
33
 
34
  # Load the model for later use in embeddings
35
  model = sentence_transformers.SentenceTransformer(EMBEDDING_MODEL_NAME)