Pclanglais committed on
Commit
4405d10
·
verified ·
1 Parent(s): 7a72935

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -26
app.py CHANGED
@@ -111,28 +111,38 @@ def split_text(text, max_tokens=500):
111
  return chunks
112
 
113
  def create_bibtex_entry(data):
114
- author = data.get('Author', '')
115
- title = data.get('Title', '')
116
- journal = data.get('Journal', '')
117
- year = data.get('Year', '')
118
- volume = data.get('Volume', '')
119
- pages = data.get('Pages', '')
120
- doi = data.get('Doi', '')
121
 
122
  # Remove "doi: " prefix if present
123
  doi = doi.replace('doi: ', '')
124
 
125
- bibtex = f"""@article{{idnothing,
126
- author = {{{author}}},
127
- title = {{{title}}},
128
- journal = {{{journal}}},
129
- year = {{{year}}},
130
- volume = {{{volume}}},
131
- pages = {{{pages}}},
132
- doi = {{{doi}}}
133
- }}"""
 
134
  return bibtex
135
 
 
 
 
 
 
 
 
 
 
136
  def transform_chunks(marianne_segmentation):
137
  marianne_segmentation = pd.DataFrame(marianne_segmentation)
138
  marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
@@ -155,17 +165,24 @@ def transform_chunks(marianne_segmentation):
155
  else:
156
  bibtex_data[entity_group] = word
157
  current_entity = entity_group
158
- elif entity_group == 'None' and current_entity:
159
- bibtex_data[current_entity] += ' ' + word
 
 
 
160
 
161
- if entity_group == 'title':
162
- html_output.append(f'<div class="manuscript"><div class="annotation">{result_entity}</div><div class="content title-content"><h2>{word}</h2></div></div>')
163
- elif entity_group == 'bibliography':
164
- html_output.append(f'<div class="manuscript"><div class="annotation">{result_entity}</div><div class="content bibliography-content">{word}</div></div>')
165
- elif entity_group == 'paratext':
166
- html_output.append(f'<div class="manuscript"><div class="annotation">{result_entity}</div><div class="content paratext-content">{word}</div></div>')
167
- else:
168
- html_output.append(f'<div class="manuscript"><div class="annotation">{result_entity}</div><div class="content">{word}</div></div>')
 
 
 
 
169
 
170
  bibtex_entry = create_bibtex_entry(bibtex_data)
171
 
 
111
  return chunks
112
 
113
  def create_bibtex_entry(data):
114
+ author = data.get('Author', '').strip()
115
+ title = data.get('Title', '').strip()
116
+ journal = data.get('Journal', '').strip()
117
+ year = data.get('Year', '').strip()
118
+ volume = data.get('Volume', '').strip()
119
+ pages = data.get('Pages', '').strip()
120
+ doi = data.get('Doi', '').strip()
121
 
122
  # Remove "doi: " prefix if present
123
  doi = doi.replace('doi: ', '')
124
 
125
+ bibtex = "@article{idnothing,\n"
126
+ if author: bibtex += f" author = {{{author}}},\n"
127
+ if title: bibtex += f" title = {{{title}}},\n"
128
+ if journal: bibtex += f" journal = {{{journal}}},\n"
129
+ if year: bibtex += f" year = {{{year}}},\n"
130
+ if volume: bibtex += f" volume = {{{volume}}},\n"
131
+ if pages: bibtex += f" pages = {{{pages}}},\n"
132
+ if doi: bibtex += f" doi = {{{doi}}},\n"
133
+ bibtex += "}"
134
+
135
  return bibtex
136
 
137
+ These changes should result in a more complete and accurate BibTeX entry. The fields will only be included if they have content, and all the information from the input should now be properly captured.
138
+
139
+ If you're still experiencing issues after making these changes, please provide an example of the input text you're using and the output you're getting. This will help me further diagnose and resolve any remaining problems.
140
+ Claude does not have the ability to run the code it generates yet.
141
+ Claude can make mistakes. Please double-check responses.
142
+
143
+
144
+
145
+
146
  def transform_chunks(marianne_segmentation):
147
  marianne_segmentation = pd.DataFrame(marianne_segmentation)
148
  marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
 
165
  else:
166
  bibtex_data[entity_group] = word
167
  current_entity = entity_group
168
+ elif entity_group == 'None':
169
+ if current_entity:
170
+ bibtex_data[current_entity] += ' ' + word
171
+ else:
172
+ bibtex_data['None'] = bibtex_data.get('None', '') + ' ' + word
173
 
174
+ html_output.append(f'<div class="manuscript"><div class="annotation">{result_entity}</div><div class="content">{word}</div></div>')
175
+
176
+ # Extract year from the 'None' field if present
177
+ none_content = bibtex_data.get('None', '')
178
+ year_match = re.search(r'\((\d{4})\)', none_content)
179
+ if year_match:
180
+ bibtex_data['Year'] = year_match.group(1)
181
+
182
+ # Extract volume from the 'None' field if present
183
+ volume_match = re.search(r',\s*(\d+),', none_content)
184
+ if volume_match:
185
+ bibtex_data['Volume'] = volume_match.group(1)
186
 
187
  bibtex_entry = create_bibtex_entry(bibtex_data)
188