mabuseif committed on
Commit
f0d676b
·
verified ·
1 Parent(s): 15231dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -3
app.py CHANGED
@@ -115,6 +115,18 @@ def normalise_hyphens(text):
115
  # Replace hyphen variants with U+002D for internal consistency
116
  return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
117
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def encode_text_fragment(text):
119
  # Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
120
  # Non-breaking hyphens (U+2011) are encoded as %E2%80%91
@@ -131,8 +143,9 @@ def generate_citation_hash(author, year, url, fragment_text, cited_text, usernam
131
  return hashlib.sha256(data.encode('utf-8')).hexdigest()
132
 
133
  def format_citation_html(url, fragment_text, author, year, scc_hash):
134
- # Use original fragment_text for text fragment URL to match external source
135
- encoded_fragment = encode_text_fragment(fragment_text)
 
136
  full_url = f"{url}#:~:text={encoded_fragment}"
137
  return f'<a href="{full_url}" data-hash="{scc_hash}">{author} ({year})</a>'
138
 
@@ -385,7 +398,9 @@ with tabs[0]:
385
  normalised_task_name = normalise_hyphens(task_name)
386
  scc_hash = generate_citation_hash(author_name, publication_year, source_url, normalised_annotated_text, normalised_annotated_text, username, normalised_task_name, current_date, current_time)
387
  citation_link_start = format_citation_html(source_url, annotated_text, author_name, publication_year, scc_hash)
388
- citation_link_end = f'<a href="{source_url}#:~:text={encode_text_fragment(annotated_text)}" data-hash="{scc_hash}">({author_name}, {publication_year})</a>'
 
 
389
  metadata_link = format_metadata_html(source_url, author_name, publication_year, scc_hash, username, task_name, current_date, current_time)
390
 
391
  col_html1, col_html2 = st.columns(2)
 
115
  # Replace hyphen variants with U+002D for internal consistency
116
  return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
117
 
118
def select_longest_segment(text):
    """Return the longest dash-free segment of *text*, stripped of whitespace.

    The text is cut at every hyphen (U+002D), non-breaking hyphen (U+2011),
    en dash (U+2013) and em dash (U+2014). Each piece is stripped of leading
    and trailing whitespace *before* lengths are compared, so a piece made
    mostly of padding can never beat a piece with more real content.

    Args:
        text: The string to split.

    Returns:
        The longest stripped segment. When several segments share the
        maximum length the first one wins. Text without any dash comes back
        as ``text.strip()``; text consisting only of dashes and whitespace
        yields an empty string.
    """
    # Fold every dash variant onto U+002D so a single split handles them all.
    unified = text.translate(
        str.maketrans({'\u2011': '-', '\u2013': '-', '\u2014': '-'})
    )
    # Strip first, then measure: comparing raw lengths would let a run of
    # spaces next to a dash win and be returned as an empty string.
    segments = [piece.strip() for piece in unified.split('-')]
    # str.split always yields at least one item, so max() cannot see an
    # empty sequence here.
    return max(segments, key=len)
129
+
130
  def encode_text_fragment(text):
131
  # Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
132
  # Non-breaking hyphens (U+2011) are encoded as %E2%80%91
 
143
  return hashlib.sha256(data.encode('utf-8')).hexdigest()
144
 
145
  def format_citation_html(url, fragment_text, author, year, scc_hash):
146
+ # Select the longest segment for the text fragment to avoid breaking the link
147
+ selected_fragment = select_longest_segment(fragment_text)
148
+ encoded_fragment = encode_text_fragment(selected_fragment)
149
  full_url = f"{url}#:~:text={encoded_fragment}"
150
  return f'<a href="{full_url}" data-hash="{scc_hash}">{author} ({year})</a>'
151
 
 
398
  normalised_task_name = normalise_hyphens(task_name)
399
  scc_hash = generate_citation_hash(author_name, publication_year, source_url, normalised_annotated_text, normalised_annotated_text, username, normalised_task_name, current_date, current_time)
400
  citation_link_start = format_citation_html(source_url, annotated_text, author_name, publication_year, scc_hash)
401
+ # Use the longest segment for the end-of-text citation link
402
+ selected_fragment = select_longest_segment(annotated_text)
403
+ citation_link_end = f'<a href="{source_url}#:~:text={encode_text_fragment(selected_fragment)}" data-hash="{scc_hash}">({author_name}, {publication_year})</a>'
404
  metadata_link = format_metadata_html(source_url, author_name, publication_year, scc_hash, username, task_name, current_date, current_time)
405
 
406
  col_html1, col_html2 = st.columns(2)