transcript-diarize-video/whisper_parallel.sh

#!/bin/bash

# Load settings from a .env file in the current directory. allexport is turned
# on while sourcing so that every variable the file assigns is also exported.
if [ -f .env ]; then
  echo "Loading environment variables from .env file..."
  set -o allexport
  source .env
  set +o allexport
else
  echo "Warning: .env file not found. Using default values."
fi

# === CONFIGURATION ===
# Each default below applies only when the variable is unset or empty after
# the .env step above.
KEY_NAME=${KEY_NAME:-"whisper-key"}
KEY_FILE=${KEY_FILE:-"$HOME/.ssh/${KEY_NAME}.pem"}
SECURITY_GROUP=${SECURITY_GROUP:-"whisper-sg"}
# NOTE(review): the previous comment described this default as a single-GPU
# type chosen to respect vCPU limits — confirm that g4dn.12xlarge is intended.
INSTANCE_TYPE=${INSTANCE_TYPE:-"g4dn.12xlarge"}
REGION=${REGION:-"eu-south-1"}
AMI_ID=${AMI_ID:-"ami-059603706d3734615"}
VIDEO_FILE=${VIDEO_FILE:-"mio_video.mp4"}
# Video name without directory and without its final extension. Only the last
# ".ext" is stripped, so "talk.part1.mp4" yields "talk.part1" instead of being
# cut at the first dot.
ORIGINAL_FILENAME=$(basename "$VIDEO_FILE")
ORIGINAL_FILENAME=${ORIGINAL_FILENAME%.*}
START_MIN=${START_MIN:-0}          # Crop start in minutes (0 = no start crop)
END_MIN=${END_MIN:-0}              # Crop end in minutes (0 = no end crop)
SHIFT_SECONDS=${SHIFT_SECONDS:-0}  # Shift timestamps by this many seconds
SHIFT_ONLY=${SHIFT_ONLY:-false}    # Set to true to only perform shifting on existing files
INPUT_PREFIX=${INPUT_PREFIX:-""}   # Prefix for input files when using SHIFT_ONLY
GPU_COUNT=${GPU_COUNT:-1}          # Number of GPUs to use (default: 1)
NUM_SPEAKERS=${NUM_SPEAKERS:-""}   # Number of speakers, if known (optional)
FIX_START=${FIX_START:-"true"}     # Adds silence at the start to capture the first seconds
# === TIMESTAMP SHIFTING ===

# Private helper: shift every "START --> END" cue line of an SRT/VTT file.
# Arguments: $1 - decimal separator of the format ("," for SRT, "." for VTT)
#            $2 - input file
#            $3 - output file
#            $4 - shift in seconds (may be negative or fractional)
# A timestamp is "[HH:]MM:SS<sep>mmm"; output always carries the hours field.
# Text following the second timestamp (cue settings, a trailing CR) is kept.
_shift_cue_timestamps() {
    local sep=$1
    local input_file=$2
    local output_file=$3
    local shift_by=$4

    # LC_ALL=C keeps number parsing independent of the user locale.
    LC_ALL=C awk -v offset="$shift_by" -v sep="$sep" '
    # Convert "[HH:]MM:SS<sep>mmm" into whole milliseconds.
    function to_ms(stamp,    part, n, minutes) {
        n = split(stamp, part, ":")
        minutes = (n == 3) ? part[1] * 60 + part[2] : part[1]
        # part[n] is "SS<sep>mmm": two digits of seconds, three of milliseconds.
        return (minutes * 60 + substr(part[n], 1, 2)) * 1000 + substr(part[n], 4, 3)
    }

    # Render whole milliseconds as "HH:MM:SS<sep>mmm"; negative times clamp to 0.
    function render(ms,    h, m, s) {
        if (ms < 0) ms = 0
        h = int(ms / 3600000); ms -= h * 3600000
        m = int(ms / 60000);   ms -= m * 60000
        s = int(ms / 1000);    ms -= s * 1000
        return sprintf("%02d:%02d:%02d%s%03d", h, m, s, sep, ms)
    }

    BEGIN {
        # Integer milliseconds avoid the off-by-one errors of float truncation.
        offset_ms = int(offset * 1000 + (offset < 0 ? -0.5 : 0.5))
        # Written without {n} intervals so that older awk versions accept it.
        stamp_re = "([0-9]+:)?[0-9][0-9]:[0-9][0-9][" sep "][0-9][0-9][0-9]"
        cue_re = "^" stamp_re " --> " stamp_re
    }

    {
        if ($0 ~ cue_re) {
            arrow = index($0, " --> ")
            head = substr($0, 1, arrow - 1)
            tail = substr($0, arrow + 5)
            match(tail, "^" stamp_re)
            trailer = substr(tail, RLENGTH + 1)
            tail = substr(tail, 1, RLENGTH)
            if (trailer == "" || trailer ~ /^[ \t\r]/) {
                print render(to_ms(head) + offset_ms) " --> " render(to_ms(tail) + offset_ms) trailer
                next
            }
        }
        print
    }' "$input_file" > "$output_file"
}

# Private helper: shift every bracketed "[MM:SS(.fff)]" or "[HH:MM:SS(.fff)]"
# timestamp found anywhere in the lines of a text transcript.
# Arguments: $1 - input file
#            $2 - output file
#            $3 - shift in seconds (may be negative or fractional)
# Output uses millisecond precision and keeps the hours field when the input
# timestamp had one (or when the shifted time needs it).
_shift_bracket_timestamps() {
    local input_file=$1
    local output_file=$2
    local shift_by=$3

    # LC_ALL=C keeps number parsing independent of the user locale.
    LC_ALL=C awk -v offset="$shift_by" '
    # Convert "MM:SS(.fff)" or "HH:MM:SS(.fff)" into whole milliseconds.
    function to_ms(stamp,    part, n, minutes) {
        n = split(stamp, part, ":")
        minutes = (n == 3) ? part[1] * 60 + part[2] : part[1]
        # part[n] holds decimal seconds, so a fraction of any length is read
        # at its true value (".4" is 400 ms, not 4 ms).
        return int((minutes * 60 + part[n]) * 1000 + 0.5)
    }

    # Render whole milliseconds in brackets; negative times clamp to 0.
    function render(ms, with_hours,    h, m, s) {
        if (ms < 0) ms = 0
        h = int(ms / 3600000); ms -= h * 3600000
        m = int(ms / 60000);   ms -= m * 60000
        s = int(ms / 1000);    ms -= s * 1000
        if (with_hours || h > 0)
            return sprintf("[%02d:%02d:%02d.%03d]", h, m, s, ms)
        return sprintf("[%02d:%02d.%03d]", m, s, ms)
    }

    BEGIN {
        offset_ms = int(offset * 1000 + (offset < 0 ? -0.5 : 0.5))
    }

    {
        # Consume the line left to right. Text that was already rewritten is
        # moved to "out" and never scanned again, so the loop always ends.
        pending = $0
        out = ""
        while (match(pending, /\[[0-9]+:[0-9]+(:[0-9]+)?(\.[0-9]+)?\]/)) {
            stamp = substr(pending, RSTART + 1, RLENGTH - 2)
            out = out substr(pending, 1, RSTART - 1) render(to_ms(stamp) + offset_ms, (stamp ~ /:.*:/))
            pending = substr(pending, RSTART + RLENGTH)
        }
        print out pending
    }' "$input_file" > "$output_file"
}

# Write a copy of a transcript with all of its timestamps moved by a fixed
# number of seconds. The format is chosen from the input file extension:
#   srt - "HH:MM:SS,mmm --> HH:MM:SS,mmm" cue lines
#   vtt - "HH:MM:SS.mmm --> HH:MM:SS.mmm" cue lines
#   txt - bracketed "[MM:SS.mmm]" / "[HH:MM:SS.mmm]" stamps anywhere in a line
# Any other extension is copied unchanged after a warning.
# Arguments: $1 - input file
#            $2 - output file (must differ from the input file)
#            $3 - shift in seconds; negative values move timestamps earlier
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, non-zero when the input is missing or awk/cp fails
shift_timestamps() {
    local input_file=$1
    local output_file=$2
    local shift_by=${3:-0}
    local file_ext="${input_file##*.}"

    if [ ! -f "$input_file" ]; then
        echo "❌ ERROR: input file not found: $input_file" >&2
        return 1
    fi

    case "$file_ext" in
        srt)
            echo "🕒 Shifting SRT timestamps by $shift_by seconds..."
            _shift_cue_timestamps "," "$input_file" "$output_file" "$shift_by"
            ;;
        vtt)
            echo "🕒 Shifting VTT timestamps by $shift_by seconds..."
            _shift_cue_timestamps "." "$input_file" "$output_file" "$shift_by"
            ;;
        txt)
            echo "🕒 Shifting timestamps in TXT by $shift_by seconds..."
            _shift_bracket_timestamps "$input_file" "$output_file" "$shift_by"
            ;;
        *)
            echo "⚠️ Unsupported file extension for shifting: $file_ext"
            cp "$input_file" "$output_file"
            ;;
    esac
}

# Shift-only mode: re-time transcript files that already exist on disk and
# exit, without converting audio or touching AWS.
if [ "$SHIFT_ONLY" = "true" ]; then
    if [ -z "$INPUT_PREFIX" ]; then
        echo "❌ ERROR: When using SHIFT_ONLY=true, you must specify INPUT_PREFIX"
        exit 1
    fi

    # Reject a non-numeric offset up front: awk evaluates such a value as 0,
    # which would produce "shifted" copies that are not shifted at all.
    number_re='^[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)$'
    if ! [[ $SHIFT_SECONDS =~ $number_re ]]; then
        echo "❌ ERROR: SHIFT_SECONDS must be a number of seconds, got '$SHIFT_SECONDS'" >&2
        exit 1
    fi

    echo "🕒 Performing timestamp shifting by $SHIFT_SECONDS seconds..."

    shifted_count=0
    failed_count=0

    # For each supported format, handle the regular transcript and the
    # "_final" transcript; either may be absent.
    for ext in txt srt vtt; do
        for stem in "${INPUT_PREFIX}" "${INPUT_PREFIX}_final"; do
            [ -f "${stem}.${ext}" ] || continue

            # Report success only when the shift actually succeeded.
            if shift_timestamps "${stem}.${ext}" "${stem}_shifted.${ext}" "$SHIFT_SECONDS"; then
                echo "✅ Created ${stem}_shifted.${ext}"
                shifted_count=$((shifted_count + 1))
            else
                echo "❌ ERROR: could not shift ${stem}.${ext}" >&2
                failed_count=$((failed_count + 1))
            fi
        done
    done

    if [ "$failed_count" -gt 0 ]; then
        exit 1
    fi

    # A prefix that matches no file is almost certainly a typo; do not claim
    # success in that case.
    if [ "$shifted_count" -eq 0 ]; then
        echo "⚠️ No transcript files found for prefix '${INPUT_PREFIX}' (.txt/.srt/.vtt)" >&2
        exit 1
    fi

    echo "✅ Timestamp shifting complete!"
    exit 0
fi

# Short random suffix so that the artifacts of separate runs get distinct
# names. Falls back through several tools depending on what is installed.
if command -v openssl > /dev/null 2>&1; then
    RANDOM_SUFFIX=$(openssl rand -hex 4)
elif command -v md5sum > /dev/null 2>&1; then
    RANDOM_SUFFIX=$(date +%s | md5sum | head -c 8)
elif command -v shasum > /dev/null 2>&1; then
    RANDOM_SUFFIX=$(date +%s | shasum | head -c 8)
else
    RANDOM_SUFFIX=$RANDOM$RANDOM
fi

AUDIO_FILE="${ORIGINAL_FILENAME}_${START_MIN}_${END_MIN}_${RANDOM_SUFFIX}.wav"
DIARIZATION_ENABLED=${DIARIZATION_ENABLED:-true}
HF_TOKEN=${HF_TOKEN:-""}
BUCKET_NAME=${BUCKET_NAME:-"whisper-video-transcripts"}

# Output file names share the audio file's stem
TRANSCRIPT_PREFIX="${ORIGINAL_FILENAME}_${START_MIN}_${END_MIN}_${RANDOM_SUFFIX}"
TRANSCRIPT_FILE="${TRANSCRIPT_PREFIX}.txt"
FINAL_TRANSCRIPT_FILE="${TRANSCRIPT_PREFIX}_final.txt"
SRT_FILE="${TRANSCRIPT_PREFIX}.srt"
VTT_FILE="${TRANSCRIPT_PREFIX}.vtt"

# === PRELIMINARY CHECKS ===
if [ ! -f "$KEY_FILE" ]; then
  echo "❌ Chiave SSH non trovata in $KEY_FILE"
  exit 1
fi
if [ ! -f "parallel_transcript.py" ]; then
  echo "❌ File parallel_transcript.py non trovato"
  exit 1
fi

if [ ! -f "$VIDEO_FILE" ]; then
  echo "❌ File video $VIDEO_FILE non trovato"
  exit 1
fi

if ! command -v ffmpeg > /dev/null 2>&1; then
  echo "❌ ffmpeg non trovato nel PATH"
  exit 1
fi

# The crop bounds feed shell arithmetic, which accepts whole numbers only.
if ! [[ "$START_MIN" =~ ^[0-9]+$ && "$END_MIN" =~ ^[0-9]+$ ]]; then
  echo "❌ START_MIN e END_MIN devono essere numeri interi non negativi (START_MIN='$START_MIN', END_MIN='$END_MIN')"
  exit 1
fi

# 10# forces base 10 so that values such as "08" are not parsed as octal.
START_SEC=$((10#$START_MIN * 60))
END_SEC=$((10#$END_MIN * 60))

# END_MIN=0 means "until the end of the video"; any other end must lie after
# the start.
if (( END_SEC != 0 && END_SEC <= START_SEC )); then
  echo "❌ END_MIN ($END_MIN) deve essere maggiore di START_MIN ($START_MIN)"
  exit 1
fi

# === CONVERT THE VIDEO TO WAV, APPLYING THE CROP BEFORE UPLOAD ===
echo "🎙️ Converto $VIDEO_FILE in $AUDIO_FILE con crop applicato..."

# The command is built as an array so that file names containing spaces,
# quotes or shell metacharacters are passed to ffmpeg verbatim (no eval).
# -y (overwrite without asking) is a global option and therefore goes first.
FFMPEG_CMD=(ffmpeg -y -i "$VIDEO_FILE")

# Add the crop options when a start and/or an end was requested
if (( START_SEC != 0 || END_SEC != 0 )); then
  FFMPEG_CMD+=(-ss "$START_SEC")
  if (( END_SEC != 0 )); then
    FFMPEG_CMD+=(-to "$END_SEC")
    echo "⏱️ Crop video da $START_MIN min a $END_MIN min"
  else
    echo "⏱️ Crop video da $START_MIN min alla fine"
  fi
fi

# Mono, 16 kHz, no video stream
FFMPEG_CMD+=(-ac 1 -ar 16000 -vn "$AUDIO_FILE")

# Stop here when the conversion fails: everything that follows (S3 upload,
# paid GPU instance) would be wasted on a missing or broken audio file.
if ! "${FFMPEG_CMD[@]}" || [ ! -s "$AUDIO_FILE" ]; then
  echo "❌ Conversione audio fallita (ffmpeg)"
  rm -f -- "$AUDIO_FILE"
  exit 1
fi

echo "☁️ Controllo se l'audio è già presente su S3..."
AUDIO_UPLOADED=""

# Print the message in $1, undo the audio staging and exit with status 1.
# Needed because this section runs before the script installs its EXIT
# cleanup trap, so nothing else would remove the staged audio on failure.
abort_provisioning() {
  echo "$1"
  if [ "$AUDIO_UPLOADED" = "true" ]; then
    aws s3 rm "s3://$BUCKET_NAME/$AUDIO_FILE" > /dev/null
  fi
  rm -f -- "$AUDIO_FILE"
  exit 1
}

if ! aws s3 ls "s3://$BUCKET_NAME/$AUDIO_FILE" > /dev/null 2>&1; then
  echo "⬆️ Carico $AUDIO_FILE su S3..."
  # Without the audio on S3 the instance has nothing to transcribe, so a
  # failed upload must stop the run before any instance is launched.
  if ! aws s3 cp "$AUDIO_FILE" "s3://$BUCKET_NAME/"; then
    abort_provisioning "❌ Upload di $AUDIO_FILE su S3 fallito"
  fi
  AUDIO_UPLOADED="true"
else
  echo "✅ Audio già presente su S3. Salto upload."
fi

# === FIND OR CREATE THE DEFAULT VPC ===
echo "🔍 Controllo default VPC nella regione $REGION..."
if ! DEFAULT_VPC_ID=$(aws ec2 describe-vpcs --region "$REGION" --filters Name=isDefault,Values=true --query "Vpcs[0].VpcId" --output text); then
  abort_provisioning "❌ Impossibile interrogare le VPC nella regione $REGION"
fi

# With --output text a missing value is printed as the literal "None"
if [ -z "$DEFAULT_VPC_ID" ] || [ "$DEFAULT_VPC_ID" = "None" ]; then
  echo "➕ Nessuna default VPC trovata. La creo..."
  DEFAULT_VPC_ID=$(aws ec2 create-default-vpc --region "$REGION" --query "Vpc.VpcId" --output text)
  if [ -z "$DEFAULT_VPC_ID" ] || [ "$DEFAULT_VPC_ID" = "None" ]; then
    abort_provisioning "❌ Creazione della default VPC fallita nella regione $REGION"
  fi
  echo "✅ Default VPC creata: $DEFAULT_VPC_ID"
else
  echo "✅ Default VPC esistente: $DEFAULT_VPC_ID"
fi

# === CREATE THE SECURITY GROUP IF NEEDED ===
if ! aws ec2 describe-security-groups --group-names "$SECURITY_GROUP" --region "$REGION" &> /dev/null; then
  echo "➕ Creo security group $SECURITY_GROUP..."
  if ! aws ec2 create-security-group --group-name "$SECURITY_GROUP" --description "Whisper SG" --vpc-id "$DEFAULT_VPC_ID" --region "$REGION"; then
    abort_provisioning "❌ Creazione del security group $SECURITY_GROUP fallita"
  fi
  # NOTE(review): SSH is opened to the whole internet (0.0.0.0/0); consider
  # restricting the CIDR to the caller's address.
  # Without this rule the instance could never be reached over SSH, so a
  # failure here also stops the run before anything is launched.
  if ! aws ec2 authorize-security-group-ingress --group-name "$SECURITY_GROUP" --protocol tcp --port 22 --cidr 0.0.0.0/0 --region "$REGION"; then
    abort_provisioning "❌ Apertura della porta 22 sul security group $SECURITY_GROUP fallita"
  fi
fi

# === LAUNCH THE EC2 INSTANCE ===
echo "🚀 Avvio istanza EC2 GPU ($INSTANCE_TYPE con GPU)..."
INSTANCE_ID=$(aws ec2 run-instances \
  --image-id "$AMI_ID" \
  --instance-type "$INSTANCE_TYPE" \
  --key-name "$KEY_NAME" \
  --security-groups "$SECURITY_GROUP" \
  --iam-instance-profile Name=WhisperS3Profile \
  --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":50}}]' \
  --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=whisper-runner}]" \
  --region "$REGION" \
  --query "Instances[0].InstanceId" \
  --output text)

if [ -z "$INSTANCE_ID" ] || [ "$INSTANCE_ID" = "None" ]; then
  # Clear the value so that later code never treats "None" as an instance ID
  INSTANCE_ID=""
  abort_provisioning "❌ ERRORE: ID istanza non ottenuto. Verifica che l'AMI sia corretta per la regione $REGION."
fi

echo "🆔 Istanza avviata: $INSTANCE_ID"

# === CLEANUP ON ANY EXIT ===
# Release what this run created: the EC2 instance, the audio staged on S3 and
# the local audio file.
# Globals (read): INSTANCE_ID, REGION, AUDIO_FILE, AUDIO_UPLOADED, BUCKET_NAME
# Outputs:        progress messages on stdout
cleanup() {
  local terminate_requested=false
  local wait_timeout=60
  local wait_start wait_elapsed status

  echo "🧨 Cleanup in corso..."

  # Request termination first: the instance is the resource that keeps
  # costing money, so it must not depend on the slower steps below finishing
  # (for example when the user presses Ctrl+C a second time).
  if [ -n "$INSTANCE_ID" ]; then
    echo "🧹 Termino l'istanza EC2 ($INSTANCE_ID)..."
    if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --region "$REGION" > /dev/null; then
      terminate_requested=true
    else
      echo "⚠️ Richiesta di terminazione fallita: termina manualmente l'istanza $INSTANCE_ID nella regione $REGION."
    fi
  fi

  # Remove the local audio file if it exists
  if [ -f "$AUDIO_FILE" ]; then
    echo "🧹 Rimuovo file audio locale $AUDIO_FILE..."
    rm -f "$AUDIO_FILE"
    echo "✅ File audio locale rimosso."
  fi

  # Remove the audio from S3 only when this run uploaded it
  if [ "$AUDIO_UPLOADED" = "true" ]; then
    echo "🧹 Rimuovo $AUDIO_FILE da S3..."
    if aws s3 rm "s3://$BUCKET_NAME/$AUDIO_FILE"; then
      echo "✅ File rimosso da S3."
    else
      echo "⚠️ Impossibile rimuovere s3://$BUCKET_NAME/$AUDIO_FILE: rimuovilo manualmente."
    fi
  fi

  # Poll until the instance reports "terminated", giving up after
  # wait_timeout seconds so that cleanup itself can never hang.
  if [ "$terminate_requested" = true ]; then
    echo "⏳ Aspetto la terminazione dell'istanza (max 60 secondi)..."
    wait_start=$(date +%s)

    while true; do
      status=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" --region "$REGION" --query "Reservations[0].Instances[0].State.Name" --output text 2> /dev/null)

      # "None" is what --output text prints when the instance is no longer listed
      if [ "$status" = "terminated" ] || [ "$status" = "None" ]; then
        echo "✅ Istanza terminata con successo."
        break
      fi

      wait_elapsed=$(($(date +%s) - wait_start))
      if [ "$wait_elapsed" -ge "$wait_timeout" ]; then
        echo "⚠️ Timeout durante l'attesa della terminazione. L'istanza potrebbe essere ancora in fase di terminazione."
        break
      fi

      # Wait two seconds before polling again
      sleep 2
      echo -n "."
    done
  fi
}

# Run cleanup on every exit path: normal end, error exit, or Ctrl+C
trap cleanup EXIT

# Options shared by every ssh/scp call so they all treat the host key the
# same way. The instance is brand new and its public IP may already be known
# with a different host key, so keys are neither verified nor recorded.
SSH_OPTS=(-i "$KEY_FILE" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR)

echo "⏳ Attendo che sia pronta..."
if ! aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" --region "$REGION"; then
  echo "❌ L'istanza $INSTANCE_ID non ha raggiunto lo stato running"
  exit 1
fi

echo "🔐 Aspetto che l'istanza sia pronta per SSH..."
SSH_READY=false
for i in {1..35}; do
  PUBLIC_IP=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" --region "$REGION" --query "Reservations[0].Instances[0].PublicIpAddress" --output text)
  echo "🌍 IP pubblico: $PUBLIC_IP"

  # -w 5 bounds each probe so that a filtered port cannot stall the loop
  if [ -n "$PUBLIC_IP" ] && [ "$PUBLIC_IP" != "None" ] \
    && nc -z -w 5 "$PUBLIC_IP" 22 > /dev/null 2>&1; then
    echo "✅ Porta 22 aperta, l'istanza è pronta!"
    SSH_READY=true
    break
  fi

  echo "⏳ Tentativo $i/35: porta 22 ancora chiusa. Riprovo tra 5s..."
  sleep 5
done

# Every later step needs SSH; give up (the EXIT trap releases the instance)
if [ "$SSH_READY" != true ]; then
  echo "❌ Porta 22 mai raggiungibile sull'istanza $INSTANCE_ID"
  exit 1
fi

# === UPLOAD THE PYTHON SCRIPT TO THE INSTANCE ===
echo "📦 Carico script sulla macchina EC2..."
for upload in parallel_transcript.py requirements.txt; do
  if ! scp "${SSH_OPTS[@]}" "$upload" "ubuntu@$PUBLIC_IP:/home/ubuntu/"; then
    echo "❌ Impossibile caricare $upload sull'istanza"
    exit 1
  fi
done
# .env is optional: the script itself runs on defaults when it is absent
if [ -f .env ]; then
  scp "${SSH_OPTS[@]}" .env "ubuntu@$PUBLIC_IP:/home/ubuntu/" || echo "⚠️ Impossibile caricare .env"
fi

# Build the transcription command as an argument list, then quote it for the
# remote shell with %q. This replaces the remote eval of an unquoted string.
# NOTE(review): the token is still passed on the remote command line because
# parallel_transcript.py is invoked with --token; it is only kept out of the
# log output below.
TRANSCRIBE_ARGS=(python3 parallel_transcript.py --audio "$AUDIO_FILE" --token "$HF_TOKEN" --output-prefix "$TRANSCRIPT_PREFIX")

if [ "$DIARIZATION_ENABLED" = false ]; then
  TRANSCRIBE_ARGS+=(--no-diarization)
fi

echo "Audio file: $AUDIO_FILE"
# Never print the secret itself, only whether one is configured
if [ -n "$HF_TOKEN" ]; then
  echo "Token Hugging Face: (impostato, nascosto)"
else
  echo "Token Hugging Face: (non impostato)"
fi
echo "Diarization enabled: $DIARIZATION_ENABLED"
echo "Numero di speaker: $NUM_SPEAKERS"

if [ -n "$NUM_SPEAKERS" ]; then
  TRANSCRIBE_ARGS+=(--num-speakers "$NUM_SPEAKERS")
  echo "👥 Utilizzo numero di speaker specificato: $NUM_SPEAKERS"
fi

if [ "$FIX_START" = true ]; then
  TRANSCRIBE_ARGS+=(--fix-start)
  echo "⏱️ Aggiunta correzione per i primi secondi"
fi

printf -v TRANSCRIBE_CMD '%q ' "${TRANSCRIBE_ARGS[@]}"
printf -v REMOTE_AUDIO '%q' "$AUDIO_FILE"
printf -v REMOTE_S3_URL '%q' "s3://$BUCKET_NAME/$AUDIO_FILE"
printf -v REMOTE_REGION '%q' "$REGION"

# Script executed on the instance. Every value interpolated here was quoted
# above. PYTHONUNBUFFERED keeps the Python output streaming; "pip show" is
# used instead of piping "pip list" into grep, which can end in a broken pipe.
REMOTE_SCRIPT="
  export PYTHONUNBUFFERED=1
  set -e
  cd /home/ubuntu

  echo '⬇️ Download da S3...'
  aws s3 cp $REMOTE_S3_URL /home/ubuntu/$REMOTE_AUDIO --region $REMOTE_REGION

  echo '📦 File scaricato:'
  ls -lh $REMOTE_AUDIO

  echo '⚙️ Attivo ambiente virtuale...'
  source whisper-env/bin/activate

  if ! pip show pydub > /dev/null 2>&1; then
    echo '📦 Installo dipendenze mancanti...'
    pip install pydub
  fi

  pip install -r requirements.txt

  echo '🖥️ Informazioni GPU:'
  nvidia-smi

  echo '✍️ Lancio trascrizione avanzata...'
  $TRANSCRIBE_CMD
"

echo "⚙️ Scarico audio da S3 ed eseguo trascrizione avanzata..."
ssh -t "${SSH_OPTS[@]}" -o "SendEnv=TERM" "ubuntu@$PUBLIC_IP" "$REMOTE_SCRIPT"
REMOTE_STATUS=$?

if [ "$REMOTE_STATUS" -ne 0 ]; then
  echo "❌ Trascrizione remota fallita (exit code $REMOTE_STATUS). Provo comunque a scaricare i file eventualmente prodotti."
fi

# === DOWNLOAD THE OUTPUT FILES ===
echo "⬇️ Scarico i file di output..."
REMOTE_HOME="ubuntu@$PUBLIC_IP:/home/ubuntu"
scp "${SSH_OPTS[@]}" "$REMOTE_HOME/${TRANSCRIPT_PREFIX}_final.txt" . || echo "⚠️ Impossibile scaricare _final.txt (potrebbe non essere stato generato)"
scp "${SSH_OPTS[@]}" "$REMOTE_HOME/${TRANSCRIPT_PREFIX}.txt" . || echo "⚠️ Impossibile scaricare .txt"
scp "${SSH_OPTS[@]}" "$REMOTE_HOME/${TRANSCRIPT_PREFIX}.srt" . || echo "⚠️ Impossibile scaricare .srt"
scp "${SSH_OPTS[@]}" "$REMOTE_HOME/${TRANSCRIPT_PREFIX}.vtt" . || echo "⚠️ Impossibile scaricare .vtt"

# The JSON side files are debugging aids only: fetch them when present and
# stay silent when they are not.
for debug_suffix in .txt.words.json _final.txt.diarization.json _final.txt.overlaps.json; do
  scp "${SSH_OPTS[@]}" "$REMOTE_HOME/${TRANSCRIPT_PREFIX}${debug_suffix}" . 2> /dev/null || true
done

echo "📄 File scaricati:"
ls -lh -- "${TRANSCRIPT_PREFIX}"* 2> /dev/null || echo "⚠️ Nessun file trovato con il prefisso specificato"

# Propagate a failed transcription to the caller instead of always exiting 0
exit "$REMOTE_STATUS"