Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -115,6 +115,18 @@ def normalise_hyphens(text):
|
|
115 |
# Replace hyphen variants with U+002D for internal consistency
|
116 |
return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
def encode_text_fragment(text):
|
119 |
# Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
|
120 |
# Non-breaking hyphens (U+2011) are encoded as %E2%80%91
|
@@ -131,8 +143,9 @@ def generate_citation_hash(author, year, url, fragment_text, cited_text, usernam
|
|
131 |
return hashlib.sha256(data.encode('utf-8')).hexdigest()
|
132 |
|
133 |
def format_citation_html(url, fragment_text, author, year, scc_hash):
|
134 |
-
#
|
135 |
-
|
|
|
136 |
full_url = f"{url}#:~:text={encoded_fragment}"
|
137 |
return f'<a href="{full_url}" data-hash="{scc_hash}">{author} ({year})</a>'
|
138 |
|
@@ -385,7 +398,9 @@ with tabs[0]:
|
|
385 |
normalised_task_name = normalise_hyphens(task_name)
|
386 |
scc_hash = generate_citation_hash(author_name, publication_year, source_url, normalised_annotated_text, normalised_annotated_text, username, normalised_task_name, current_date, current_time)
|
387 |
citation_link_start = format_citation_html(source_url, annotated_text, author_name, publication_year, scc_hash)
|
388 |
-
|
|
|
|
|
389 |
metadata_link = format_metadata_html(source_url, author_name, publication_year, scc_hash, username, task_name, current_date, current_time)
|
390 |
|
391 |
col_html1, col_html2 = st.columns(2)
|
|
|
115 |
# Replace hyphen variants with U+002D for internal consistency
|
116 |
return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
|
117 |
|
118 |
+
def select_longest_segment(text):
    """Return the longest dash-free segment of *text*, stripped of whitespace.

    The text is cut at every hyphen (U+002D), non-breaking hyphen (U+2011),
    en dash (U+2013) and em dash (U+2014). Each resulting segment has its
    surrounding whitespace removed, and the segment with the most remaining
    characters is returned. When several segments tie, the earliest wins.

    Args:
        text: The string to split.

    Returns:
        The longest stripped segment. If *text* contains no dashes this is
        simply ``text.strip()``; if *text* consists only of dashes and
        whitespace the result is the empty string.
    """
    # Fold every dash variant onto U+002D so one split finds all boundaries.
    other_dashes = '\u2011\u2013\u2014'
    unified = text.translate(str.maketrans(other_dashes, '-' * len(other_dashes)))

    # Strip BEFORE measuring: otherwise a short segment padded with spaces
    # could be chosen over a segment that carries more actual text.
    segments = [segment.strip() for segment in unified.split('-')]

    # str.split always yields at least one element, so max() needs no default.
    return max(segments, key=len)
|
129 |
+
|
130 |
def encode_text_fragment(text):
|
131 |
# Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
|
132 |
# Non-breaking hyphens (U+2011) are encoded as %E2%80%91
|
|
|
143 |
return hashlib.sha256(data.encode('utf-8')).hexdigest()
|
144 |
|
145 |
def format_citation_html(url, fragment_text, author, year, scc_hash):
    """Build the HTML anchor for a citation that deep-links to the cited text.

    The link target is *url* followed by a ``#:~:text=`` text-fragment
    directive built from the longest dash-free segment of *fragment_text*.
    The anchor text is ``"{author} ({year})"`` and the citation hash is
    carried in a ``data-hash`` attribute.

    Args:
        url: Address of the cited source.
        fragment_text: The cited passage; only its longest dash-free segment
            is used for the text fragment.
        author: Author name shown in the link text.
        year: Publication year shown in the link text.
        scc_hash: Citation hash stored in the ``data-hash`` attribute.

    Returns:
        A string containing a single ``<a>`` element.
    """
    # Local import keeps this block self-contained.
    from html import escape

    # Select the longest segment for the text fragment to avoid breaking the link
    selected_fragment = select_longest_segment(fragment_text)
    encoded_fragment = encode_text_fragment(selected_fragment)
    full_url = f"{url}#:~:text={encoded_fragment}"

    # Escape every interpolated value so quotes, angle brackets or ampersands
    # in the inputs cannot terminate an attribute or inject markup.
    href = escape(full_url, quote=True)
    hash_attr = escape(f"{scc_hash}", quote=True)
    label = escape(f"{author} ({year})", quote=False)
    return f'<a href="{href}" data-hash="{hash_attr}">{label}</a>'
|
151 |
|
|
|
398 |
normalised_task_name = normalise_hyphens(task_name)
|
399 |
scc_hash = generate_citation_hash(author_name, publication_year, source_url, normalised_annotated_text, normalised_annotated_text, username, normalised_task_name, current_date, current_time)
|
400 |
citation_link_start = format_citation_html(source_url, annotated_text, author_name, publication_year, scc_hash)
|
401 |
+
# Use the longest segment for the end-of-text citation link
|
402 |
+
selected_fragment = select_longest_segment(annotated_text)
|
403 |
+
citation_link_end = f'<a href="{source_url}#:~:text={encode_text_fragment(selected_fragment)}" data-hash="{scc_hash}">({author_name}, {publication_year})</a>'
|
404 |
metadata_link = format_metadata_html(source_url, author_name, publication_year, scc_hash, username, task_name, current_date, current_time)
|
405 |
|
406 |
col_html1, col_html2 = st.columns(2)
|