Spaces:
Runtime error
Runtime error
Mohammaderfan koupaei
committed on
Commit
·
6b418f0
1
Parent(s):
941a5b8
Add application file
Browse files
- app.py +1 -10
- requirements.txt +2 -0
- scripts/data_processing/data_preparation.py +9 -10
app.py
CHANGED
@@ -17,16 +17,7 @@ def main():
|
|
17 |
logger = logging.getLogger(__name__)
|
18 |
logger.info("Initializing training process...")
|
19 |
import os
|
20 |
-
|
21 |
-
|
22 |
-
# Set up NLTK data directory
|
23 |
-
nltk_data_path = "./nltk_data"
|
24 |
-
os.makedirs(nltk_data_path, exist_ok=True)
|
25 |
-
nltk.data.path.append(nltk_data_path)
|
26 |
-
|
27 |
-
# Ensure NLTK resources are available
|
28 |
-
nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
|
29 |
-
nltk.download('stopwords', download_dir=nltk_data_path, quiet=True)
|
30 |
|
31 |
# Set up logging
|
32 |
logging.basicConfig(level=logging.INFO)
|
|
|
17 |
logger = logging.getLogger(__name__)
|
18 |
logger.info("Initializing training process...")
|
19 |
import os
|
20 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
# Set up logging
|
23 |
logging.basicConfig(level=logging.INFO)
|
requirements.txt
CHANGED
@@ -7,3 +7,5 @@ tiktoken
|
|
7 |
sentencepiece
|
8 |
pandas
|
9 |
numpy
|
|
|
|
|
|
7 |
sentencepiece
|
8 |
pandas
|
9 |
numpy
|
10 |
+
spacy
|
11 |
+
en_core_web_sm
|
scripts/data_processing/data_preparation.py
CHANGED
@@ -3,9 +3,7 @@ import numpy as np
|
|
3 |
from pathlib import Path
|
4 |
from typing import Dict, List, Tuple
|
5 |
import re
|
6 |
-
import nltk
|
7 |
-
from nltk.tokenize import word_tokenize, sent_tokenize
|
8 |
-
from nltk.corpus import stopwords
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
from sklearn.model_selection import StratifiedKFold
|
11 |
import torch
|
@@ -25,10 +23,9 @@ class AdvancedNarrativeProcessor:
|
|
25 |
# Initialize tokenizer
|
26 |
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
27 |
|
28 |
-
# Initialize
|
29 |
-
|
30 |
-
|
31 |
-
self.stopwords = set(stopwords.words('english'))
|
32 |
|
33 |
# Initialize state
|
34 |
self.df = None
|
@@ -119,9 +116,11 @@ class AdvancedNarrativeProcessor:
|
|
119 |
return text.strip()
|
120 |
|
121 |
def extract_features(self, text: str) -> Dict:
|
122 |
-
"""Extract rich text features"""
|
123 |
-
|
124 |
-
|
|
|
|
|
125 |
|
126 |
return {
|
127 |
'length': len(words),
|
|
|
3 |
from pathlib import Path
|
4 |
from typing import Dict, List, Tuple
|
5 |
import re
|
6 |
+
import spacy
|
|
|
|
|
7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
from sklearn.model_selection import StratifiedKFold
|
9 |
import torch
|
|
|
23 |
# Initialize tokenizer
|
24 |
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
25 |
|
26 |
+
# Initialize SpaCy
|
27 |
+
self.nlp = spacy.load("en_core_web_sm") # Download it with `python -m spacy download en_core_web_sm`
|
28 |
+
self.stopwords = spacy.lang.en.stop_words.STOP_WORDS
|
|
|
29 |
|
30 |
# Initialize state
|
31 |
self.df = None
|
|
|
116 |
return text.strip()
|
117 |
|
118 |
def extract_features(self, text: str) -> Dict:
|
119 |
+
"""Extract rich text features using SpaCy."""
|
120 |
+
# Process text with SpaCy
|
121 |
+
doc = self.nlp(text)
|
122 |
+
words = [token.text for token in doc if not token.is_space]
|
123 |
+
sentences = list(doc.sents)
|
124 |
|
125 |
return {
|
126 |
'length': len(words),
|