|
import pandas as pd |
|
from typing import Generator, Tuple |
|
|
|
def chunk_dataframe_with_context( |
|
df: pd.DataFrame, |
|
chunk_size: int = 10, |
|
overlap: int = 3 |
|
) -> Generator[Tuple[pd.DataFrame, pd.DataFrame], None, None]: |
|
""" |
|
Chunk the DataFrame into overlapping segments. For each core chunk, |
|
include 'overlap' rows before and after as additional context. |
|
|
|
Yields: |
|
(core_df, extended_df): |
|
core_df = The rows we actually want to process in this chunk |
|
extended_df = core_df + overlap context before and after |
|
""" |
|
start = 0 |
|
total_rows = len(df) |
|
|
|
while start < total_rows: |
|
|
|
core_start = start |
|
core_end = min(start + chunk_size, total_rows) |
|
|
|
|
|
extended_start = max(0, core_start - overlap) |
|
extended_end = min(core_end + overlap, total_rows) |
|
|
|
core_df = df.iloc[core_start:core_end] |
|
extended_df = df.iloc[extended_start:extended_end] |
|
|
|
yield core_df, extended_df |
|
|
|
start += chunk_size |
|
|
|
if __name__ == "__main__": |
|
df = pd.DataFrame({ |
|
"Task ID": range(1, 26), |
|
"Description": [f"Task {i}" for i in range(1, 26)] |
|
}) |
|
for i, (core_df, extended_df) in enumerate(chunk_dataframe_with_context(df, chunk_size=5, overlap=2)): |
|
print(f"CHUNK {i} - Core:\n{core_df}\n\nContext:\n{extended_df}\n{'-'*40}") |
|
|