unijoh committed on
Commit
34202a3
·
verified ·
1 Parent(s): b8a1a0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -23
app.py CHANGED
@@ -3,32 +3,31 @@ import pandas as pd
3
 
4
  # Load and parse the CSV file from Hugging Face
5
  def load_data():
6
- url = "https://huggingface.co/datasets/unijoh/RAVNlex/blob/main/RAVNlex_small.csv"
 
7
  lemmas = {}
8
  current_lemma = None
9
 
10
- with open(url, 'r', encoding='iso-8859-10') as file:
11
- reader = pd.read_csv(file, delimiter='\t', encoding='iso-8859-10', dtype=str, quoting=csv.QUOTE_NONE)
12
- for row in reader.itertuples(index=False, name=None):
13
- if len(row) < 5:
14
- print(f"Skipping problematic line {reader.line_num}: {row}")
15
- continue
16
- orto, ppos, phon1, phon2, comm, *pronunciations = row
17
- if orto == '---':
18
- current_lemma = None
19
- elif current_lemma is None:
20
- current_lemma = orto.replace("ORTO:", "")
21
- lemmas[current_lemma] = []
22
- else:
23
- lemma_data = {
24
- 'word': orto.replace("ORTO:", "") if orto else "",
25
- 'PPOS': ppos.replace("PPOS:", "") if ppos else "",
26
- 'PHON1': phon1.replace("PHON:", "") if phon1 else "",
27
- 'PHON2': phon2.replace("PHON:", "") if phon2 else "",
28
- 'COMM': comm if comm else "",
29
- 'pronunciations': pronunciations
30
- }
31
- lemmas[current_lemma].append(lemma_data)
32
 
33
  print("Loaded lemmas:", lemmas) # Debugging output
34
  return lemmas
 
3
 
4
  # Load and parse the CSV file from Hugging Face
5
  def load_data():
6
+ url = "https://huggingface.co/datasets/unijoh/RAVNlex/resolve/main/RAVNlex_small.csv"
7
+ df = pd.read_csv(url, delimiter='\t', encoding='iso-8859-10', dtype=str)
8
  lemmas = {}
9
  current_lemma = None
10
 
11
+ for row in df.itertuples(index=False, name=None):
12
+ if len(row) < 5:
13
+ print(f"Skipping problematic line: {row}")
14
+ continue
15
+ orto, ppos, phon1, phon2, comm, *pronunciations = row
16
+ if orto == '---':
17
+ current_lemma = None
18
+ elif current_lemma is None:
19
+ current_lemma = orto.replace("ORTO:", "")
20
+ lemmas[current_lemma] = []
21
+ else:
22
+ lemma_data = {
23
+ 'word': orto.replace("ORTO:", "") if orto else "",
24
+ 'PPOS': ppos.replace("PPOS:", "") if ppos else "",
25
+ 'PHON1': phon1.replace("PHON:", "") if phon1 else "",
26
+ 'PHON2': phon2.replace("PHON:", "") if phon2 else "",
27
+ 'COMM': comm if comm else "",
28
+ 'pronunciations': pronunciations
29
+ }
30
+ lemmas[current_lemma].append(lemma_data)
 
 
31
 
32
  print("Loaded lemmas:", lemmas) # Debugging output
33
  return lemmas