Spaces:

uuuy5615
/

test

Runtime error

App Files Files Community

uuuy5615 committed on 2 days ago

Commit

a73dd87

verified ·

1 Parent(s): 68a1205

Update backend/spellchecker.py

Browse files

Files changed (1) hide show

backend/spellchecker.py +191 -191

backend/spellchecker.py CHANGED Viewed

@@ -1,191 +1,191 @@
-import json
-import difflib
-from hanspell import spell_checker
-from hanspell.constants import CheckResult
-from kiwipiepy import Kiwi
-ERROR_TYPE_MAPPING = {
-    CheckResult.PASSED: 0,  # 문제가 없는 단어 또는 구절
-    CheckResult.WRONG_SPELLING: 1,  # 맞춤법에 문제가 있는 단어 또는 구절
-    CheckResult.WRONG_SPACING: 2,  # 띄어쓰기에 문제가 있는 단어 또는 구절
-    CheckResult.AMBIGUOUS: 3,  # 표준어가 의심되는 단어 또는 구절
-    CheckResult.STATISTICAL_CORRECTION: 4,  # 통계적 교정에 따른 단어 또는 구절
-}
-import difflib
-def update_corrections_by_error_text(original_text, checked_text, corrections):
-    updated = []
-    for corr in corrections:
-        error = corr["error"]
-        start_pos = original_text.find(error)
-        if start_pos == -1:
-            # error 문장을 못 찾은 경우 position 기반으로 fallback
-            start_pos = corr["position"]
-        length = len(error)
-        # checked_text에서 동일 위치 추정
-        corrected_text = checked_text[start_pos : start_pos + length]
-        new_corr = corr.copy()
-        new_corr["checked"] = corrected_text
-        new_corr["position"] = start_pos  # 위치 보정
-        new_corr["length"] = length
-        updated.append(new_corr)
-    return updated
-def extract_phrase(text: str, position: int) -> str:
-    if position < 0 or position >= len(text):
-        return ""
-    # 왼쪽 탐색: position - 1 부터 공백이 나올 때까지
-    left = position - 1
-    while left >= 0 and text[left] != " ":
-        left -= 1
-    # 오른쪽 탐색: position + 1 부터 공백이 나올 때까지
-    right = position + 1
-    while right < len(text) and text[right] != " ":
-        right += 1
-    return text[left + 1 : right]
-def get_space_diffs(original: str, corrected: str):
-    diffs = []
-    orig_len = len(original)
-    corr_len = len(corrected)
-    o_idx = c_idx = 0
-    while o_idx < orig_len and c_idx < corr_len:
-        o_char = original[o_idx]
-        c_char = corrected[c_idx]
-        # 동일 문자면 통과
-        if o_char == c_char:
-            o_idx += 1
-            c_idx += 1
-            continue
-        # 원문에 공백이 있고 교정문에 없으면 → delete_space
-        if o_char == " " and c_char != " ":
-            error = extract_phrase(original, o_idx)
-            check = spell_checker.check(error).as_dict()["checked"]
-            diffs.append(
-                {
-                    "error": error,
-                    "checked": check,
-                    "position": o_idx,
-                    "length": -1,
-                    "errortype": ERROR_TYPE_MAPPING[2],
-                }
-            )
-            o_idx += 1  # 공백을 넘김
-        # 교정문에 공백이 있고 원문에 없으면 → insert_space
-        elif c_char == " " and o_char != " ":
-            # 공백을 그 "앞 문자" 뒤에 삽입한다고 가정
-            error = extract_phrase(original, o_idx)
-            check = spell_checker.check(error).as_dict()["checked"]
-            diffs.append(
-                {
-                    "error": error,
-                    "checked": check,
-                    "position": o_idx,  # 원문 기준 삽입 위치
-                    "length": 1,
-                    "errortype": ERROR_TYPE_MAPPING[2],
-                }
-            )
-            c_idx += 1  # 공백을 넘김
-        # 둘 다 다르지만 공백도 아닐 때 (문법 교정 등): 그냥 넘김
-        else:
-            o_idx += 1
-            c_idx += 1
-    return diffs
-def check(text: str):
-    ch_text = spell_checker.check(text)
-    info = ch_text.as_dict()
-    orig_text = info["original"]
-    corr_text = info["checked"]
-    time = info["time"]
-    if orig_text == corr_text:
-        flag = 0
-    else:
-        flag = 1
-    print(info["words"])
-    space = get_space_diffs(orig_text, corr_text)
-    # 1) original↔corrected 간 문자 단위 매핑 생성
-    sm = difflib.SequenceMatcher(None, orig_text, corr_text)
-    mapping = {}
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == "equal":
-            # 일치 블록: 1:1 매핑
-            for offset in range(i2 - i1):
-                mapping[j1 + offset] = i1 + offset
-        elif tag in ("replace", "insert"):
-            # 교체블록·삽입블록: 교정문자 모두 원본 블록 시작 위치로 매핑
-            for offset in range(j2 - j1):
-                mapping[j1 + offset] = i1
-    # 2) 토큰별로 위치 및 원래 틀린 단어 추출
-    corrections = []
-    for token, status in info["words"].items():
-        if status == CheckResult.PASSED or status == CheckResult.WRONG_SPACING:
-            continue
-        corr_pos = corr_text.find(token)
-        if corr_pos != -1 and corr_pos in mapping:
-            orig_pos = mapping[corr_pos]
-            # 원본 텍스트에서 token 길이만큼 잘라낸다다.
-            error_word = orig_text[orig_pos : orig_pos + len(token)]
-        else:
-            orig_pos = None
-            error_word = token
-        length = len(error_word)
-        corrections.append(
-            {
-                "error": error_word,
-                "checked": token,
-                "position": orig_pos,
-                "length": length,
-                "errortype": ERROR_TYPE_MAPPING[status],
-            }
-        )
-    combined = corrections + space
-    sorted_combined = sorted(combined, key=lambda x: x["position"])
-    result = {
-        "flag": flag,
-        "original_text": info["original"],
-        "checked_text": info["checked"],
-        "corrections": sorted_combined,
-        "time": time,
-    }
-    return result
-if __name__ == "__main__":
-    sample = "나는 오늘 아침밥을 먹고 학교 를 갔다.학교 를 아는 친구들이 많치만, 오늘은 별루 보이지 않았다. 학교앞 문구점에서 볼펜을 샀는데, 그 볼펜은 잉크가 자주 말라서 자주 바꿔야한다. 학교에서 학교 행사에 대한 얘기를 들었는데, 별루 기대는 안된다."
-    sample2 = "현대 교육은 단순히 지식을 전달하는 것을 넘어서, 학생의 전인적 성잘을 목표로 한다. 이에 따라 정서적 지지와 사회성 교육도 점점 중요해지고 있있다. 그러나 아직도 많은 학교에서는 주입식 교육이 중심이 되어, 학생들이 주도적으로 학습할 기회가 적다. 또한, 교사들의 과도한 행정업무로 인해 수업 준비에 충분한 시간을 가질수 없고, 이는 교육의 질 저하로 이어질 수 있따. 지속적인 교사 연수와 교육환경 개선이 뒷받침되어야만 미래형 교육이 실현될 수 있슬 것이다."
-    output = check(sample2)
-    print(json.dumps(output, ensure_ascii=False, indent=2))
-    print(sample2[79])
-    # "flag": 문장에 맞춤법 오류가 있는지의 여부(0: 없음/1: 있음)
-    # "original_text": 원본 문장
-    # "checked_text": 맞춤법이 수정된 문장
-    # "corrections"[
-    # {
-    #   "error": 맞춤법이 틀린 단어
-    #   "position": 틀린 단어의 문장 내 위치(시작점)
-    #   "errortype": 오류 유형(1~4)
-    # },
-    # ]
-    # "time": 소요 시간

+import json
+import difflib
+from backend.hanspell import spell_checker
+from backend.hanspell.constants import CheckResult
+from kiwipiepy import Kiwi
+ERROR_TYPE_MAPPING = {  # CheckResult status -> integer "errortype" code emitted in correction entries
+    CheckResult.PASSED: 0,  # word or phrase with no problem
+    CheckResult.WRONG_SPELLING: 1,  # word or phrase with a spelling error
+    CheckResult.WRONG_SPACING: 2,  # word or phrase with a spacing error
+    CheckResult.AMBIGUOUS: 3,  # word or phrase suspected of being non-standard language
+    CheckResult.STATISTICAL_CORRECTION: 4,  # word or phrase changed by statistical correction
+}
+import difflib
def update_corrections_by_error_text(original_text, checked_text, corrections):
    """Re-anchor each correction on the first occurrence of its error text.

    For every entry the ``"error"`` string is searched in ``original_text``.
    The match index (or, when there is no match, the entry's own
    ``"position"``) becomes the new ``"position"``, ``"checked"`` becomes the
    slice of ``checked_text`` at that same index and length, and ``"length"``
    becomes ``len(error)``.

    Args:
        original_text: Text before correction.
        checked_text: Text after correction.
        corrections: Iterable of dicts, each with an ``"error"`` key and
            normally a ``"position"`` key (which may be ``None``).

    Returns:
        A new list of shallow copies; the input dicts are not mutated.
        An entry that cannot be located (error text absent and no recorded
        position) keeps its existing ``"checked"``/``"position"`` values and
        only receives ``"length"``.
    """
    updated = []
    for corr in corrections:
        error = corr["error"]
        length = len(error)

        start_pos = original_text.find(error)
        if start_pos == -1:
            # Error text not found: fall back to the recorded position.
            # check() stores None when it cannot map a token, so the fallback
            # itself may be unusable.
            start_pos = corr.get("position")

        new_corr = corr.copy()
        if start_pos is not None:
            # Estimate: assume the corrected text sits at the same offset in
            # checked_text as the error does in original_text.
            new_corr["checked"] = checked_text[start_pos : start_pos + length]
            new_corr["position"] = start_pos
        new_corr["length"] = length
        updated.append(new_corr)
    return updated
def extract_phrase(text: str, position: int) -> str:
    """Return the space-delimited run of characters surrounding ``position``.

    The run extends from just after the nearest ``" "`` strictly left of
    ``position`` up to (not including) the nearest ``" "`` strictly right of
    it. The character at ``position`` itself is never tested, so when it is
    a space the words on both sides are returned joined by that space.

    Args:
        text: Text to slice.
        position: Index into ``text``.

    Returns:
        The surrounding run, or ``""`` when ``position`` is out of range.
    """
    if not 0 <= position < len(text):
        return ""

    # rfind yields -1 when no space precedes position, so start becomes 0.
    start = text.rfind(" ", 0, position) + 1

    # No space after position means the run reaches the end of the text.
    stop = text.find(" ", position + 1)
    if stop == -1:
        stop = len(text)

    return text[start:stop]
def get_space_diffs(original: str, corrected: str):
    """Collect spacing corrections by walking both texts in lockstep.

    Two cursors advance over ``original`` and ``corrected``. Where the
    characters differ and one of them is a space, a spacing entry is
    recorded and only the cursor sitting on the space advances. Where they
    differ and neither is a space, both cursors advance and nothing is
    recorded. The walk stops as soon as either text is exhausted.

    Args:
        original: Text before correction.
        corrected: Text after correction.

    Returns:
        A list of dicts with the keys:
            ``"error"``: the phrase around the position, taken from
                ``original`` via ``extract_phrase``.
            ``"checked"``: that phrase as returned by ``spell_checker.check``.
            ``"position"``: index in ``original`` where the difference occurs.
            ``"length"``: ``-1`` when ``original`` has a space that
                ``corrected`` lacks, ``1`` when ``corrected`` has a space
                that ``original`` lacks.
            ``"errortype"``: the code mapped from ``CheckResult.WRONG_SPACING``.
    """
    # The same phrase is reported once per differing space inside it; cache
    # the checker's answer so each distinct phrase is submitted only once.
    checked_by_phrase = {}

    def spacing_entry(position, length):
        phrase = extract_phrase(original, position)
        if phrase not in checked_by_phrase:
            checked_by_phrase[phrase] = spell_checker.check(phrase).as_dict()["checked"]
        return {
            "error": phrase,
            "checked": checked_by_phrase[phrase],
            "position": position,
            "length": length,
            "errortype": ERROR_TYPE_MAPPING[CheckResult.WRONG_SPACING],
        }

    diffs = []
    o_idx = c_idx = 0
    while o_idx < len(original) and c_idx < len(corrected):
        o_char = original[o_idx]
        c_char = corrected[c_idx]

        if o_char == c_char:
            o_idx += 1
            c_idx += 1
        elif o_char == " ":
            # Space present only in the original: it should be deleted.
            diffs.append(spacing_entry(o_idx, -1))
            o_idx += 1  # step over the extra space
        elif c_char == " ":
            # Space present only in the corrected text: it should be inserted
            # before the current original character.
            diffs.append(spacing_entry(o_idx, 1))
            c_idx += 1  # step over the inserted space
        else:
            # Non-space difference (e.g. a spelling fix): not a spacing issue.
            o_idx += 1
            c_idx += 1
    return diffs
def check(text: str):
    """Spell-check ``text`` and describe each correction against the original.

    The text is sent to ``spell_checker.check``. Spacing differences are
    gathered by ``get_space_diffs``; every other non-passing token reported
    by the checker is located in the corrected text and mapped back to an
    index in the original text through a character-level alignment.

    Args:
        text: Text to check.

    Returns:
        A dict with the keys:
            ``"flag"``: ``0`` when the corrected text equals the original,
                otherwise ``1``.
            ``"original_text"``: the original text as reported by the checker.
            ``"checked_text"``: the corrected text.
            ``"corrections"``: list of correction dicts (``"error"``,
                ``"checked"``, ``"position"``, ``"length"``, ``"errortype"``)
                ordered by ``"position"``; entries whose position could not
                be determined (``None``) come last.
            ``"time"``: the ``"time"`` value reported by the checker.
    """
    info = spell_checker.check(text).as_dict()
    orig_text = info["original"]
    corr_text = info["checked"]
    elapsed = info["time"]
    flag = 0 if orig_text == corr_text else 1

    space = get_space_diffs(orig_text, corr_text)

    # 1) Build a character-level map: index in corrected -> index in original.
    # autojunk=False: the default heuristic treats frequent characters as
    # junk once a sequence reaches 200 items, which hurts alignment of
    # paragraph-sized text.
    matcher = difflib.SequenceMatcher(None, orig_text, corr_text, autojunk=False)
    mapping = {}
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            # Matching block: one-to-one mapping.
            for offset in range(i2 - i1):
                mapping[j1 + offset] = i1 + offset
        elif tag in ("replace", "insert"):
            # Replaced/inserted block: every corrected character maps to the
            # start of the corresponding original block.
            for offset in range(j2 - j1):
                mapping[j1 + offset] = i1

    # 2) For each flagged token, recover its position and the original text.
    # Spacing issues are excluded here because get_space_diffs reports them.
    skipped = (CheckResult.PASSED, CheckResult.WRONG_SPACING)
    corrections = []
    for token, status in info["words"].items():
        if status in skipped:
            continue
        # NOTE(review): find() returns the first occurrence only, so a token
        # that also appears earlier in the text is anchored there.
        corr_pos = corr_text.find(token)
        if corr_pos != -1 and corr_pos in mapping:
            orig_pos = mapping[corr_pos]
            # Slice the original text using the corrected token's length.
            error_word = orig_text[orig_pos : orig_pos + len(token)]
        else:
            orig_pos = None
            error_word = token
        corrections.append(
            {
                "error": error_word,
                "checked": token,
                "position": orig_pos,
                "length": len(error_word),
                "errortype": ERROR_TYPE_MAPPING[status],
            }
        )

    # Positions may be None, which cannot be compared with ints; sort on a
    # (is-missing, position) pair so located entries keep their order and
    # unlocated ones go to the end.
    sorted_combined = sorted(
        corrections + space,
        key=lambda item: (item["position"] is None, item["position"] or 0),
    )
    return {
        "flag": flag,
        "original_text": info["original"],
        "checked_text": info["checked"],
        "corrections": sorted_combined,
        "time": elapsed,
    }
+if __name__ == "__main__":  # manual demo: run check() on a sample paragraph and print the result as JSON
+    sample = "나는 오늘 아침밥을 먹고 학교 를 갔다.학교 를 아는 친구들이 많치만, 오늘은 별루 보이지 않았다. 학교앞 문구점에서 볼펜을 샀는데, 그 볼펜은 잉크가 자주 말라서 자주 바꿔야한다. 학교에서 학교 행사에 대한 얘기를 들었는데, 별루 기대는 안된다."  # alternate sample input; not used below
+    sample2 = "현대 교육은 단순히 지식을 전달하는 것을 넘어서, 학생의 전인적 성잘을 목표로 한다. 이에 따라 정서적 지지와 사회성 교육도 점점 중요해지고 있있다. 그러나 아직도 많은 학교에서는 주입식 교육이 중심이 되어, 학생들이 주도적으로 학습할 기회가 적다. 또한, 교사들의 과도한 행정업무로 인해 수업 준비에 충분한 시간을 가질수 없고, 이는 교육의 질 저하로 이어질 수 있따. 지속적인 교사 연수와 교육환경 개선이 뒷받침되어야만 미래형 교육이 실현될 수 있슬 것이다."
+    output = check(sample2)
+    print(json.dumps(output, ensure_ascii=False, indent=2))
+    print(sample2[79])  # prints the single character at index 79 of sample2
+    # "flag": whether the text contains spelling errors (0: none / 1: present)
+    # "original_text": the original text
+    # "checked_text": the text after spelling correction
+    # "corrections"[
+    # {
+    #   "error": the misspelled word
+    #   "position": position of the misspelled word within the text (start index)
+    #   "errortype": error type (1~4)
+    # },
+    # ]
+    # "time": elapsed time