Mohammaderfan koupaei committed on
Commit
6b418f0
·
1 Parent(s): 941a5b8

Add application file

Browse files
app.py CHANGED
@@ -17,16 +17,7 @@ def main():
17
  logger = logging.getLogger(__name__)
18
  logger.info("Initializing training process...")
19
  import os
20
- import nltk
21
-
22
- # Set up NLTK data directory
23
- nltk_data_path = "./nltk_data"
24
- os.makedirs(nltk_data_path, exist_ok=True)
25
- nltk.data.path.append(nltk_data_path)
26
-
27
- # Ensure NLTK resources are available
28
- nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
29
- nltk.download('stopwords', download_dir=nltk_data_path, quiet=True)
30
 
31
  # Set up logging
32
  logging.basicConfig(level=logging.INFO)
 
17
  logger = logging.getLogger(__name__)
18
  logger.info("Initializing training process...")
19
  import os
20
+
 
 
 
 
 
 
 
 
 
21
 
22
  # Set up logging
23
  logging.basicConfig(level=logging.INFO)
requirements.txt CHANGED
@@ -7,3 +7,5 @@ tiktoken
7
  sentencepiece
8
  pandas
9
  numpy
 
 
 
7
  sentencepiece
8
  pandas
9
  numpy
10
+ spacy
11
+ en_core_web_sm
scripts/data_processing/data_preparation.py CHANGED
@@ -3,9 +3,7 @@ import numpy as np
3
  from pathlib import Path
4
  from typing import Dict, List, Tuple
5
  import re
6
- import nltk
7
- from nltk.tokenize import word_tokenize, sent_tokenize
8
- from nltk.corpus import stopwords
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.model_selection import StratifiedKFold
11
  import torch
@@ -25,10 +23,9 @@ class AdvancedNarrativeProcessor:
25
  # Initialize tokenizer
26
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
27
 
28
- # Initialize NLTK
29
- nltk.download('punkt', quiet=True)
30
- nltk.download('stopwords', quiet=True)
31
- self.stopwords = set(stopwords.words('english'))
32
 
33
  # Initialize state
34
  self.df = None
@@ -119,9 +116,11 @@ class AdvancedNarrativeProcessor:
119
  return text.strip()
120
 
121
  def extract_features(self, text: str) -> Dict:
122
- """Extract rich text features"""
123
- words = word_tokenize(text)
124
- sentences = sent_tokenize(text)
 
 
125
 
126
  return {
127
  'length': len(words),
 
3
  from pathlib import Path
4
  from typing import Dict, List, Tuple
5
  import re
6
+ import spacy
 
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.model_selection import StratifiedKFold
9
  import torch
 
23
  # Initialize tokenizer
24
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
25
 
26
+ # Initialize SpaCy
27
+ self.nlp = spacy.load("en_core_web_sm") # Download it with `python -m spacy download en_core_web_sm`
28
+ self.stopwords = spacy.lang.en.stop_words.STOP_WORDS
 
29
 
30
  # Initialize state
31
  self.df = None
 
116
  return text.strip()
117
 
118
  def extract_features(self, text: str) -> Dict:
119
+ """Extract rich text features using SpaCy."""
120
+ # Process text with SpaCy
121
+ doc = self.nlp(text)
122
+ words = [token.text for token in doc if not token.is_space]
123
+ sentences = list(doc.sents)
124
 
125
  return {
126
  'length': len(words),