mabuseif committed on
Commit
9e4c5a7
·
verified ·
1 Parent(s): 201ac1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -5
app.py CHANGED
@@ -115,12 +115,22 @@ def normalise_hyphens(text):
115
  # Replace hyphen variants with U+002D for internal consistency
116
  return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
117
 
 
 
 
 
 
 
 
 
 
 
 
118
  def encode_text_fragment(text):
 
 
119
  # Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
120
- # Non-breaking hyphens (U+2011) are encoded as %E2%80%91
121
- # En dashes (U+2013) are encoded as %E2%80%93
122
- # Em dashes (U+2014) are encoded as %E2%80%94
123
- return urllib.parse.quote(text, safe='-')
124
 
125
  def generate_citation_hash(author, year, url, fragment_text, cited_text, username, task_name, current_date, current_time):
126
  # Normalise hyphens for consistent hash generation
@@ -131,7 +141,7 @@ def generate_citation_hash(author, year, url, fragment_text, cited_text, usernam
131
  return hashlib.sha256(data.encode('utf-8')).hexdigest()
132
 
133
  def format_citation_html(url, fragment_text, author, year, scc_hash):
134
- # Use original fragment_text for text fragment URL to match external source
135
  encoded_fragment = encode_text_fragment(fragment_text)
136
  full_url = f"{url}#:~:text={encoded_fragment}"
137
  return f'<a href="{full_url}" data-hash="{scc_hash}">{author} ({year})</a>'
 
115
  # Replace hyphen variants with U+002D for internal consistency
116
  return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
117
 
118
def get_longest_segment(text):
    """Return the longest dash-free piece of ``text``.

    The text is split on every dash-like character, each piece is stripped of
    surrounding whitespace, and the longest non-empty piece is returned. When
    several pieces share the maximum length, the first one wins.

    Args:
        text: The string to split.

    Returns:
        The longest stripped segment, or ``text`` unchanged when no non-empty
        segment exists (for example an empty string or a string made only of
        dashes and whitespace).
    """
    # U+002D HYPHEN-MINUS plus the contiguous range U+2010..U+2015:
    # HYPHEN, NON-BREAKING HYPHEN, FIGURE DASH, EN DASH, EM DASH and
    # HORIZONTAL BAR. Any of these can be swapped for another between the
    # quoted text and the source page, so none may survive in the result.
    dash_pattern = r'[\u002D\u2010-\u2015]'
    pieces = (piece.strip() for piece in re.split(dash_pattern, text))
    segments = [piece for piece in pieces if piece]
    if not segments:
        # Nothing usable was left after splitting; hand back the input as-is.
        return text
    return max(segments, key=len)
128
+
129
  def encode_text_fragment(text):
130
+ # Get the longest segment if text contains dashes
131
+ fragment_text = get_longest_segment(text)
132
  # Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
133
+ return urllib.parse.quote(fragment_text, safe='-')
 
 
 
134
 
135
  def generate_citation_hash(author, year, url, fragment_text, cited_text, username, task_name, current_date, current_time):
136
  # Normalise hyphens for consistent hash generation
 
141
  return hashlib.sha256(data.encode('utf-8')).hexdigest()
142
 
143
  def format_citation_html(url, fragment_text, author, year, scc_hash):
144
+ # Use the longest segment for the text fragment URL
145
  encoded_fragment = encode_text_fragment(fragment_text)
146
  full_url = f"{url}#:~:text={encoded_fragment}"
147
  return f'<a href="{full_url}" data-hash="{scc_hash}">{author} ({year})</a>'