bhlewis committed on
Commit
eaf6036
·
verified ·
1 Parent(s): 778d4fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -6
app.py CHANGED
@@ -8,6 +8,10 @@ from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  import re
10
  from collections import Counter
 
 
 
 
11
 
12
  def load_data():
13
  try:
@@ -52,14 +56,13 @@ tfidf_vectorizer = TfidfVectorizer(stop_words='english')
52
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
53
 
54
  def extract_key_features(text):
55
- # Extract noun phrases as potential key features
56
- noun_phrases = re.findall(r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b', text)
57
- # Extract phrases following "comprising", "including", "consisting of" as potential key features
58
- feature_phrases = re.findall(r'(?:comprising|including|consisting of)\s+(.*?)(?:;|\.)', text, re.IGNORECASE)
59
 
60
  all_features = noun_phrases + feature_phrases
61
- # Remove duplicates and lowercase
62
- return list(set(feature.lower() for feature in all_features))
63
 
64
  def compare_features(query_features, patent_features):
65
  common_features = set(query_features) & set(patent_features)
 
8
  from sklearn.metrics.pairwise import cosine_similarity
9
  import re
10
  from collections import Counter
11
+ import spacy
12
+
13
+ # Load Spacy model for advanced NLP
14
+ nlp = spacy.load("en_core_web_sm")
15
 
16
  def load_data():
17
  try:
 
56
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
57
 
58
  def extract_key_features(text):
59
+ # Use Spacy to extract noun phrases and key phrases
60
+ doc = nlp(text)
61
+ noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
62
+ feature_phrases = [sent.text.lower() for sent in doc.sents if re.search(r'(comprising|including|consisting of)', sent.text, re.IGNORECASE)]
63
 
64
  all_features = noun_phrases + feature_phrases
65
+ return list(set(all_features))
 
66
 
67
  def compare_features(query_features, patent_features):
68
  common_features = set(query_features) & set(patent_features)