mabuseif committed on
Commit
15231dd
·
verified ·
1 Parent(s): 6a28cf0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -15
app.py CHANGED
@@ -115,15 +115,6 @@ def normalise_hyphens(text):
115
  # Replace hyphen variants with U+002D for internal consistency
116
  return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
117
 
118
- def select_longest_segment(text):
119
- # Split text by various dashes (hyphen, en dash, em dash)
120
- dash_variants = ['\u002D', '\u2011', '\u2013', '\u2014']
121
- segments = text
122
- for dash in dash_variants:
123
- segments = segments.split(dash)
124
- # Return the longest segment, or original text if no dashes
125
- return max(segments, key=len, default=text).strip()
126
-
127
  def encode_text_fragment(text):
128
  # Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
129
  # Non-breaking hyphens (U+2011) are encoded as %E2%80%91
@@ -131,12 +122,13 @@ def encode_text_fragment(text):
131
  # Em dashes (U+2014) are encoded as %E2%80%94
132
  return urllib.parse.quote(text, safe='-')
133
 
134
- def format_citation_html(url, fragment_text, author, year, scc_hash):
135
- # Select the longest segment for the text fragment to avoid breaking the link
136
- selected_fragment = select_longest_segment(fragment_text)
137
- encoded_fragment = encode_text_fragment(selected_fragment)
138
- full_url = f"{url}#:~:text={encoded_fragment}"
139
- return f'<a href="{full_url}" data-hash="{scc_hash}">{author} ({year})</a>'
 
140
 
141
  def format_citation_html(url, fragment_text, author, year, scc_hash):
142
  # Use original fragment_text for text fragment URL to match external source
 
115
  # Replace hyphen variants with U+002D for internal consistency
116
  return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
117
 
 
 
 
 
 
 
 
 
 
118
  def encode_text_fragment(text):
119
  # Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
120
  # Non-breaking hyphens (U+2011) are encoded as %E2%80%91
 
122
  # Em dashes (U+2014) are encoded as %E2%80%94
123
  return urllib.parse.quote(text, safe='-')
124
 
125
+ def generate_citation_hash(author, year, url, fragment_text, cited_text, username, task_name, current_date, current_time):
126
+ # Normalise hyphens for consistent hash generation
127
+ normalised_fragment_text = normalise_hyphens(fragment_text)
128
+ normalised_cited_text = normalise_hyphens(cited_text)
129
+ normalised_task_name = normalise_hyphens(task_name)
130
+ data = f"{author}, {year} | {url} | {normalised_fragment_text} | {normalised_cited_text} | {username} | {normalised_task_name} | {current_date} | {current_time}"
131
+ return hashlib.sha256(data.encode('utf-8')).hexdigest()
132
 
133
  def format_citation_html(url, fragment_text, author, year, scc_hash):
134
  # Use original fragment_text for text fragment URL to match external source