import logging import traceback import re from typing import Dict, List, Optional from landmark_data import ALL_LANDMARKS class TextFormattingError(Exception): """文本格式化過程中的自定義異常""" pass class TextFormatter: """ 文本格式化器 - 負責文本拼接、格式化和最終輸出優化 該類別處理所有與文本格式化相關的邏輯,包括智能文本拼接、 標點符號處理、大小寫規範化以及地標引用的過濾功能。 """ def __init__(self): """ 初始化文本格式化器 """ self.logger = logging.getLogger(self.__class__.__name__) try: # 載入地標數據用於引用過濾 self.landmark_data = self._load_landmark_data() self.logger.info("TextFormatter initialized successfully") except Exception as e: error_msg = f"Failed to initialize TextFormatter: {str(e)}" self.logger.error(f"{error_msg}\n{traceback.format_exc()}") raise TextFormattingError(error_msg) from e def _load_landmark_data(self) -> Dict: """ 載入地標數據 Returns: Dict: 地標數據字典 """ try: return ALL_LANDMARKS except ImportError: self.logger.warning("Failed to import landmark data, landmark filtering will be disabled") return {} except Exception as e: self.logger.warning(f"Error loading landmark data: {str(e)}") return {} def smart_append(self, current_text: str, new_fragment: str) -> str: """ 將新文本片段附加到現有文本,處理標點符號和大小寫 Args: current_text: 要加到的現有文本 new_fragment: 要加的新文本片段 Returns: str: 合併後的文本,具有適當的格式化 """ try: # 處理空值情況 if not new_fragment: return current_text if not current_text: # 確保第一個字符大寫 return new_fragment[0].upper() + new_fragment[1:] if new_fragment else "" # 清理現有文本 current_text = current_text.rstrip() # 檢查結尾標點符號 ends_with_sentence = current_text.endswith(('.', '!', '?')) ends_with_comma = current_text.endswith(',') # 特別處理 "A xxx A yyy" 模式 if (current_text.startswith("A ") or current_text.startswith("An ")) and \ (new_fragment.startswith("A ") or new_fragment.startswith("An ")): return current_text + ". " + new_fragment # 檢查新片段是否包含地標名稱(通常為專有名詞) has_landmark_name = any(word[0].isupper() for word in new_fragment.split() if len(word) > 2 and not word.startswith(("A ", "An ", "The "))) # 決定如何連接文本 if ends_with_sentence: # 句子後,以大寫開始並添加適當間距 joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:]) elif ends_with_comma: # 逗號後,要保持流暢性,除非是專有名詞或特殊情況 if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name: joined_text = current_text + " " + new_fragment else: joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:] elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower(): # 加關於場景的新句子時,使用句號 joined_text = current_text + ". " + new_fragment else: # 其他情況,根據內容決定 if self._is_related_phrases(current_text, new_fragment): if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name: joined_text = current_text + ", " + new_fragment else: joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:] else: # 對不相關的短語使用句號 joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:]) return joined_text except Exception as e: self.logger.warning(f"Error in smart_append: {str(e)}") # 備用簡單拼接 return f"{current_text} {new_fragment}" if current_text else new_fragment def _is_related_phrases(self, text1: str, text2: str) -> bool: """ 判斷兩個短語是否相關,應該用逗號 Args: text1: 第一個文本片段 text2: 要加的第二個文本片段 Returns: bool: 短語是否相關 """ try: # 檢查兩個短語是否都以 "A" 或 "An" 開始 - 這些是獨立的描述 if (text1.startswith("A ") or text1.startswith("An ")) and \ (text2.startswith("A ") or text2.startswith("An ")): return False # 這些是獨立的描述,不是相關短語 # 檢查第二個短語是否以連接詞開始 connecting_words = ["which", "where", "who", "whom", "whose", "with", "without", "this", "these", "that", "those", "and", "or", "but"] first_word = text2.split()[0].lower() if text2 else "" if first_word in connecting_words: return True # 檢查第一個短語是否以暗示連續性的內容結尾 ending_patterns = ["such as", "including", "like", "especially", "particularly", "for example", "for instance", "namely", "specifically"] for pattern in ending_patterns: if text1.lower().endswith(pattern): return True # 檢查兩個短語是否都關於場景 if "scene" in text1.lower() and "scene" in text2.lower(): return False # 關於場景的獨立陳述應該是分開的句子 return False except Exception as e: self.logger.warning(f"Error checking phrase relationship: {str(e)}") return False def format_final_description(self, text: str) -> str: """ 格式化最終描述文本,確保正確的標點符號、大小寫和間距 Args: text: 要格式化的文本 Returns: str: 格式化後的文本 """ try: if not text or not text.strip(): return "" # 首先修剪前導/尾隨空白 text = text.strip() # 1. 處理連續的 "A/An" 段落(可能將它們分成句子) text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE) text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE) # 2. 確保整個文本的第一個字符大寫 if text: text = text[0].upper() + text[1:] # 3. 規範化空白:多個空格變為一個 text = re.sub(r'\s{2,}', ' ', text) # 4. 句子結尾標點符號後大寫 def capitalize_after_punctuation(match): return match.group(1) + match.group(2).upper() text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text) # 5. 處理逗號後的大小寫 def fix_capitalization_after_comma(match): leading_comma_space = match.group(1) # (,\s+) word_after_comma = match.group(2) # ([A-Z][a-zA-Z]*) proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"] if word_after_comma in proper_nouns_exceptions: return match.group(0) # 如果詞看起來像專有名詞(已經大寫且不是常用詞),保持不變 if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]: return match.group(0) # 如果看起來已經是專有名詞則保持不變 return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:] text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text) # 6. 修正標點符號周圍的間距 text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # 確保標點符號後有一個空格,前面沒有 text = text.replace(' .', '.').replace(' ,', ',') # 清理標點符號前可能的空格 # 7. 合併多個句子結尾標點符號 text = re.sub(r'[.!?]{2,}', '.', text) # 將多個轉換為單個句號 text = re.sub(r',+', ',', text) # 多個逗號變為一個 # 8. 確保文本以單個句子結尾標點符號結尾 text = text.strip() # 檢查最後一個字符前移除尾隨空白 if text and not text[-1] in '.!?': text += '.' # 9. 處理空的佔位符和前導標點符號 text = re.sub(r'\bIn\s*,\s*', 'In this scene, ', text) # 修復 "In , " 問題 text = re.sub(r'\s*,\s*([A-Z])', r'. \1', text) # 修復逗號後直接跟大寫字母的問題 text = re.sub(r'^[.,;:!?\s]+', '', text) # 移除前導標點符號 # 10. 第一個字母大寫的最終檢查 if text: text = text[0].upper() + text[1:] # 11. 移除最終標點符號前的空格(如果規則7意外添加) text = re.sub(r'\s+([.!?])$', r'\1', text) # 12. 移除重複性描述詞彙的最終檢查 identical_cleanup_patterns = [ (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'), (r'\b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'), (r'\bidentical\s+([a-zA-Z\s]+)', r'\1'), (r'\bcomprehensive arrangement of\b', 'arrangement of'), ] for pattern, replacement in identical_cleanup_patterns: text = re.sub(pattern, replacement, text, flags=re.IGNORECASE) return text.strip() # 最終修剪 except Exception as e: self.logger.warning(f"Error formatting final description: {str(e)}") # 備用基本格式化 if text: text = text.strip() if text and not text.endswith(('.', '!', '?')): text += '.' if text: text = text[0].upper() + text[1:] return text return "" def filter_landmark_references(self, text: str, enable_landmark: bool = True) -> str: """ 動態過濾文本中的地標引用 Args: text: 需要過濾的文本 enable_landmark: 是否啟用地標功能 Returns: str: 過濾後的文本 """ try: if enable_landmark or not text: return text # 動態收集所有地標名稱和位置 landmark_names = [] locations = [] for landmark_id, info in self.landmark_data.items(): # 收集地標名稱及其別名 landmark_names.append(info["name"]) landmark_names.extend(info.get("aliases", [])) # 收集地理位置 if "location" in info: location = info["location"] locations.append(location) # 處理分離的城市和國家名稱 parts = location.split(",") if len(parts) >= 1: locations.append(parts[0].strip()) if len(parts) >= 2: locations.append(parts[1].strip()) # 替換所有地標名稱 for name in landmark_names: if name and len(name) > 2: # 避免過短的名稱 text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE) # 動態替換所有位置引用 for location in locations: if location and len(location) > 2: # 替換常見位置表述模式 text = re.sub(r'in ' + re.escape(location), "in the urban area", text, flags=re.IGNORECASE) text = re.sub(r'of ' + re.escape(location), "of the urban area", text, flags=re.IGNORECASE) text = re.sub(r'\b' + re.escape(location) + r'\b', "the urban area", text, flags=re.IGNORECASE) # 通用地標描述模式替換 landmark_patterns = [ (r'a (tourist|popular|famous) landmark', r'an urban structure'), (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'), (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'), (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'), (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'), (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'), (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'), (r'a (tower|structure) in ([A-Z][a-zA-Z\s,]+)', r'a \1 in the area'), (r'landmark scene', r'urban scene'), (r'tourist destination', r'urban area'), (r'tourist attraction', r'urban area') ] for pattern, replacement in landmark_patterns: text = re.sub(pattern, replacement, text, flags=re.IGNORECASE) return text except Exception as e: self.logger.warning(f"Error filtering landmark references: {str(e)}") return text def optimize_text_flow(self, text: str) -> str: """ 優化文本流暢性,減少重複和改善可讀性 Args: text: 要優化的文本 Returns: str: 優化後的文本 """ try: if not text: return text # 移除重複的短語 text = self._remove_duplicate_phrases(text) # 優化連接詞使用 text = self._optimize_connectors(text) # 平衡句子長度 text = self._balance_sentence_length(text) return text except Exception as e: self.logger.warning(f"Error optimizing text flow: {str(e)}") return text def _remove_duplicate_phrases(self, text: str) -> str: """ 移除文本中的重複短語 Args: text: 輸入文本 Returns: str: 移除重複後的文本 """ try: # 分割成句子 sentences = re.split(r'[.!?]+', text) unique_sentences = [] seen_content = set() for sentence in sentences: sentence = sentence.strip() if not sentence: continue # 規範化以進行比較(移除額外空白和標點符號) normalized = re.sub(r'\s+', ' ', sentence.lower().strip()) # 檢查是否實質相似 is_duplicate = False for seen in seen_content: if self._sentences_similar(normalized, seen): is_duplicate = True break if not is_duplicate: unique_sentences.append(sentence) seen_content.add(normalized) return '. '.join(unique_sentences) + '.' if unique_sentences else "" except Exception as e: self.logger.warning(f"Error removing duplicate phrases: {str(e)}") return text def _sentences_similar(self, sent1: str, sent2: str) -> bool: """ 檢查兩個句子是否相似 Args: sent1: 第一個句子 sent2: 第二個句子 Returns: bool: 句子是否相似 """ try: # 簡單的相似性檢查:如果80%的詞彙重疊 words1 = set(sent1.split()) words2 = set(sent2.split()) if not words1 or not words2: return False intersection = len(words1 & words2) union = len(words1 | words2) similarity = intersection / union if union > 0 else 0 return similarity > 0.8 except Exception as e: self.logger.warning(f"Error checking sentence similarity: {str(e)}") return False def _optimize_connectors(self, text: str) -> str: """ 優化連接詞的使用 Args: text: 輸入文本 Returns: str: 優化連接詞後的文本 """ try: # 替換重複的連接詞 text = re.sub(r'\band\s+and\b', 'and', text, flags=re.IGNORECASE) text = re.sub(r'\bwith\s+with\b', 'with', text, flags=re.IGNORECASE) # 改善過度使用 "and" 的情況 text = re.sub(r'(\w+),\s+and\s+(\w+),\s+and\s+(\w+)', r'\1, \2, and \3', text) return text except Exception as e: self.logger.warning(f"Error optimizing connectors: {str(e)}") return text def _balance_sentence_length(self, text: str) -> str: """ 平衡句子長度,分割過長的句子 Args: text: 輸入文本 Returns: str: 平衡句子長度後的文本 """ try: sentences = re.split(r'([.!?]+)', text) balanced_text = "" for i in range(0, len(sentences), 2): if i + 1 < len(sentences): sentence = sentences[i] punctuation = sentences[i + 1] # 如果句子太長(超過150個字符),嘗試在適當位置分割 if len(sentence) > 150: # 在逗號或連接詞處分割 split_points = [m.start() for m in re.finditer(r',\s+(?:and|but|or|while|when|where)', sentence)] if split_points: mid_point = split_points[len(split_points) // 2] first_part = sentence[:mid_point].strip() second_part = sentence[mid_point + 1:].strip() if second_part and not second_part[0].isupper(): second_part = second_part[0].upper() + second_part[1:] balanced_text += first_part + ". " + second_part + punctuation + " " else: balanced_text += sentence + punctuation + " " else: balanced_text += sentence + punctuation + " " return balanced_text.strip() except Exception as e: self.logger.warning(f"Error balancing sentence length: {str(e)}") return text def validate_text_quality(self, text: str) -> Dict[str, bool]: """ 驗證文本質量 Args: text: 要驗證的文本 Returns: Dict[str, bool]: 質量檢查結果 """ try: quality_checks = { "has_content": bool(text and text.strip()), "proper_capitalization": bool(text and text[0].isupper()) if text else False, "ends_with_punctuation": bool(text and text.strip()[-1] in '.!?') if text else False, "no_double_spaces": " " not in text if text else True, "no_leading_punctuation": not bool(re.match(r'^[.,;:!?]', text.strip())) if text else True, "reasonable_length": 20 <= len(text) <= 1000 if text else False } return quality_checks except Exception as e: self.logger.warning(f"Error validating text quality: {str(e)}") return {"error": True} def get_text_statistics(self, text: str) -> Dict[str, int]: """ 獲取文本統計信息 Args: text: 要分析的文本 Returns: Dict[str, int]: 文本統計信息 """ try: if not text: return {"characters": 0, "words": 0, "sentences": 0} characters = len(text) words = len(text.split()) sentences = len(re.findall(r'[.!?]+', text)) return { "characters": characters, "words": words, "sentences": sentences } except Exception as e: self.logger.warning(f"Error getting text statistics: {str(e)}") return {"characters": 0, "words": 0, "sentences": 0} def deduplicate_sentences_in_description(self, description: str, similarity_threshold: float = 0.80) -> str: """ 從一段描述文本中移除重複或高度相似的句子。 此方法會嘗試保留更長、資訊更豐富的句子版本。 Args: description (str): 原始描述文本。 similarity_threshold (float): 判斷句子是否相似的 Jaccard 相似度閾值 (0 到 1)。 預設為 0.8,表示詞彙重疊度達到80%即視為相似。 Returns: str: 移除了重複或高度相似句子後的文本。 """ try: if not description or not description.strip(): self.logger.debug("deduplicate_sentences_in_description: Received empty or blank description.") return "" # 使用正則表達式分割句子,保留句尾標點符號 sentences = re.split(r'(?<=[.!?])\s+', description.strip()) if not sentences: self.logger.debug("deduplicate_sentences_in_description: No sentences found after splitting.") return "" unique_sentences_data = [] # 存儲 (原始句子文本, 該句子的詞彙集合) for current_sentence_text in sentences: current_sentence_text = current_sentence_text.strip() if not current_sentence_text: continue # 預處理當前句子以進行比較:轉小寫、移除標點、分割成詞彙集合 simplified_current_text = re.sub(r'[^\w\s\d]', '', current_sentence_text.lower()) # 保留數字 current_sentence_words = set(simplified_current_text.split()) if not current_sentence_words: # 如果處理後是空集合 (例如句子只包含標點) # 如果原始句子有內容(例如只有一個標點),就保留它 if current_sentence_text and not unique_sentences_data: # 避免在開頭加入孤立標點 unique_sentences_data.append((current_sentence_text, current_sentence_words)) continue is_subsumed_or_highly_similar = False index_to_replace = -1 for i, (kept_sentence_text, kept_sentence_words) in enumerate(unique_sentences_data): if not kept_sentence_words: # 跳過已保留的空詞彙集合 continue # 計算 Jaccard 相似度 intersection_len = len(current_sentence_words.intersection(kept_sentence_words)) union_len = len(current_sentence_words.union(kept_sentence_words)) jaccard_similarity = 0.0 if union_len > 0: jaccard_similarity = intersection_len / union_len elif not current_sentence_words and not kept_sentence_words: # 兩個都是空的 jaccard_similarity = 1.0 if jaccard_similarity >= similarity_threshold: # 如果當前句子比已保留的句子長,則標記替換舊的 if len(current_sentence_words) > len(kept_sentence_words): self.logger.debug(f"Deduplication: Replacing shorter \"{kept_sentence_text[:50]}...\" " f"with longer similar \"{current_sentence_text[:50]}...\" (Jaccard: {jaccard_similarity:.2f})") index_to_replace = i break # 找到一個可以被替換的,就跳出內層循環 # 如果當前句子比已保留的句子短,或者長度相近但內容高度相似,則標記當前句子為重複 else: # current_sentence_words is shorter or of similar length is_subsumed_or_highly_similar = True self.logger.debug(f"Deduplication: Current sentence \"{current_sentence_text[:50]}...\" " f"is subsumed by or highly similar to \"{kept_sentence_text[:50]}...\" (Jaccard: {jaccard_similarity:.2f}). Skipping.") break if index_to_replace != -1: unique_sentences_data[index_to_replace] = (current_sentence_text, current_sentence_words) elif not is_subsumed_or_highly_similar: unique_sentences_data.append((current_sentence_text, current_sentence_words)) # 從 unique_sentences_data 中提取最終的句子文本 final_sentences = [s_data[0] for s_data in unique_sentences_data] # 重組句子,確保每個句子以標點符號結尾,並且句子間有空格 reconstructed_response = "" for i, s_text in enumerate(final_sentences): s_text = s_text.strip() if not s_text: continue # 確保句子以標點結尾 if not re.search(r'[.!?]$', s_text): s_text += "." reconstructed_response += s_text if i < len(final_sentences) - 1: # 如果不是最後一句,添加空格 reconstructed_response += " " self.logger.debug(f"Deduplicated description (len {len(reconstructed_response.strip())}): '{reconstructed_response.strip()[:150]}...'") return reconstructed_response.strip() except Exception as e: self.logger.error(f"Error in deduplicate_sentences_in_description: {str(e)}") self.logger.error(traceback.format_exc()) return description # 發生錯誤時返回原始描述