78 lines
3.1 KiB
Python
78 lines
3.1 KiB
Python
import json
|
|
|
|
def _build_chunk(segment, start, end, text):
    """Return a new segment dict holding one chunk of *segment*'s text."""
    speaker = segment.get("speaker", "")
    return {
        "start": start,
        "end": end,
        "text": text.strip(),
        "speaker": speaker,
        # Reuse the parent's label when it has one; otherwise derive it from
        # the speaker so every chunk always carries a "speaker_text" key.
        "speaker_text": segment.get("speaker_text", f"[{speaker}] "),
    }


def split_long_segments(segments, max_chars=150):
    """Split segments whose text exceeds *max_chars* into smaller chunks.

    Text is split at sentence boundaries (after ``.``, ``!`` or ``?`` followed
    by one or more spaces) and consecutive sentences are packed together as
    long as the joined text stays within *max_chars*. A single sentence that
    is itself longer than *max_chars* is kept whole; it is never cut
    mid-sentence.

    Each chunk receives a share of the parent's time span proportional to its
    character count. Chunks are contiguous: the first starts at the parent's
    ``start``, each one starts where the previous one ended, and the last one
    ends exactly at the parent's ``end``.

    Args:
        segments: Iterable of dicts. Segments that are split must provide
            ``"text"``, ``"start"`` and ``"end"``; ``"speaker"`` and
            ``"speaker_text"`` are optional.
        max_chars: Maximum chunk length in characters.

    Returns:
        A new list. Segments without ``"text"`` or short enough are passed
        through as the same objects; long ones are replaced by chunk dicts
        with the keys ``start``, ``end``, ``text``, ``speaker`` and
        ``speaker_text``.
    """
    import re

    new_segments = []
    for segment in segments:
        if "text" not in segment or len(segment["text"]) <= max_chars:
            new_segments.append(segment)
            continue

        text = segment["text"]
        sentences = re.split(r'(?<=[.!?]) +', text)

        current_text = ""
        start_time = segment["start"]
        for sentence in sentences:
            if not current_text:
                current_text = sentence
                continue

            # The +1 accounts for the space that would join the two parts, so
            # a packed chunk can never grow past max_chars.
            if len(current_text) + 1 + len(sentence) > max_chars:
                # Advance from where the previous chunk ended. Measuring the
                # share from the segment start would make later chunks end
                # before they begin.
                share = len(current_text) / len(text)
                end_time = start_time + share * (segment["end"] - segment["start"])
                new_segments.append(
                    _build_chunk(segment, start_time, end_time, current_text)
                )
                start_time = end_time
                current_text = sentence
            else:
                current_text = f"{current_text} {sentence}"

        # Whatever is left over runs to the end of the parent segment.
        if current_text:
            new_segments.append(
                _build_chunk(segment, start_time, segment["end"], current_text)
            )

    return new_segments
|
|
|
|
# One deliberately over-long mock segment to exercise the splitter.
test_segments = [
    dict(
        start=0.0,
        end=10.0,
        speaker="SPEAKER_00",
        speaker_text="[SPEAKER_00] ",
        text="This is a very long text that exceeds the maximum character limit. It should be split into multiple segments. This is another sentence to make sure we have enough text to split. And one more sentence to be really sure.",
    ),
]

# Split with a small limit so the mock segment is forced apart.
print("Testing split_long_segments...")
split_segments = split_long_segments(test_segments, max_chars=50)
print(f"Number of segments after splitting: {len(split_segments)}")

# Every resulting segment should carry a speaker_text key.
all_have_speaker_text = not any(
    "speaker_text" not in piece for piece in split_segments
)
print(f"All segments have speaker_text field: {all_have_speaker_text}")

# Pretty-print the result for manual inspection.
print("\nSplit segments:")
rendered = json.dumps(split_segments, indent=2)
print(rendered)

# Direct indexing must not raise for any segment.
try:
    for piece in split_segments:
        piece["speaker_text"]
except KeyError as e:
    print(f"\n❌ KeyError when accessing: {e}")
else:
    print("\n✅ Successfully accessed speaker_text on all segments")
|