first commit

2025-05-23 14:45:22 +02:00 · 2025-05-23 14:45:22 +02:00 · b6d47982c7
commit b6d47982c7
8 changed files with 1353 additions and 0 deletions
--- a/whisper_parallel.sh
+++ b/whisper_parallel.sh
@ -0,0 +1,458 @@
+#!/bin/bash
+
+# Pull settings from a .env file in the current directory, when there is one.
+# Auto-export is enabled only while the file is being sourced, so the
+# variables it assigns are exported; assignments made later are not.
+if [[ ! -f .env ]]; then
+  echo "Warning: .env file not found. Using default values."
+else
+  echo "Loading environment variables from .env file..."
+  set -a
+  . .env
+  set +a
+fi
+
+# === CONFIGURAZIONE ===
+# These defaults will be used if not set in .env file
+KEY_NAME=${KEY_NAME:-"whisper-key"}
+KEY_FILE=${KEY_FILE:-"$HOME/.ssh/${KEY_NAME}.pem"}
+SECURITY_GROUP=${SECURITY_GROUP:-"whisper-sg"}
+INSTANCE_TYPE=${INSTANCE_TYPE:-"g4dn.12xlarge"}  # Default a 1 GPU per rispettare limiti vCPU
+REGION=${REGION:-"eu-south-1"}
+AMI_ID=${AMI_ID:-"ami-059603706d3734615"}
+VIDEO_FILE=${VIDEO_FILE:-"mio_video.mp4"}
+ORIGINAL_FILENAME=$(basename "$VIDEO_FILE" | cut -d. -f1)
+START_MIN=${START_MIN:-0}      # Default value if not set
+END_MIN=${END_MIN:-0}        # Default value if not set
+SHIFT_SECONDS=${SHIFT_SECONDS:-0}  # Shift timestamps by this many seconds
+SHIFT_ONLY=${SHIFT_ONLY:-false}   # Set to true to only perform shifting on existing files
+INPUT_PREFIX=${INPUT_PREFIX:-""}   # Prefix for input files when using SHIFT_ONLY
+GPU_COUNT=${GPU_COUNT:-1}      # Numero di GPU da utilizzare (default: 1)
+NUM_SPEAKERS=${NUM_SPEAKERS:-""}  # Numero di speaker se conosciuto (opzionale)
+FIX_START=${FIX_START:-"true"}  # Aggiunge silenzio all'inizio per catturare i primi secondi
+
+# === FUNZIONE PER SHIFT DEI TIMESTAMPS ===
+shift_timestamps() {
+    local input_file=$1
+    local output_file=$2
+    local shift_by=$3
+    local file_ext="${input_file##*.}"
+
+    if [ "$file_ext" = "srt" ]; then
+        echo "🕒 Shifting SRT timestamps by $shift_by seconds..."
+        # SRT format: 00:00:05,440 --> 00:00:08,300
+        awk -v shift=$shift_by '
+        function time_to_seconds(time_str) {
+            split(time_str, parts, ",")
+            split(parts[1], time_parts, ":")
+            return time_parts[1]*3600 + time_parts[2]*60 + time_parts[3] + parts[2]/1000
+        }
+
+        function seconds_to_time(seconds) {
+            h = int(seconds/3600)
+            m = int((seconds-h*3600)/60)
+            s = int(seconds-h*3600-m*60)
+            ms = int((seconds - int(seconds))*1000)
+            return sprintf("%02d:%02d:%02d,%03d", h, m, s, ms)
+        }
+
+        {
+            if (match($0, /^([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}) --> ([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3})$/)) {
+                start_time = time_to_seconds(substr($0, RSTART, RLENGTH/2-5))
+                end_time = time_to_seconds(substr($0, RSTART+RLENGTH/2+5, RLENGTH/2-5))
+
+                new_start = start_time + shift
+                new_end = end_time + shift
+
+                # Handle negative times (not allowed in SRT)
+                if (new_start < 0) new_start = 0
+                if (new_end < 0) new_end = 0
+
+                print seconds_to_time(new_start)" --> "seconds_to_time(new_end)
+            } else {
+                print $0
+            }
+        }' "$input_file" > "$output_file"
+
+    elif [ "$file_ext" = "vtt" ]; then
+        echo "🕒 Shifting VTT timestamps by $shift_by seconds..."
+        # VTT format: 00:00:05.440 --> 00:00:08.300
+        awk -v shift=$shift_by '
+        function time_to_seconds(time_str) {
+            split(time_str, parts, ".")
+            split(parts[1], time_parts, ":")
+            return time_parts[1]*3600 + time_parts[2]*60 + time_parts[3] + parts[2]/1000
+        }
+
+        function seconds_to_time(seconds) {
+            h = int(seconds/3600)
+            m = int((seconds-h*3600)/60)
+            s = int(seconds-h*3600-m*60)
+            ms = int((seconds - int(seconds))*1000)
+            return sprintf("%02d:%02d:%02d.%03d", h, m, s, ms)
+        }
+
+        {
+            if (match($0, /^([0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}) --> ([0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3})$/)) {
+                start_time = time_to_seconds(substr($0, RSTART, RLENGTH/2-5))
+                end_time = time_to_seconds(substr($0, RSTART+RLENGTH/2+5, RLENGTH/2-5))
+
+                new_start = start_time + shift
+                new_end = end_time + shift
+
+                # Handle negative times
+                if (new_start < 0) new_start = 0
+                if (new_end < 0) new_end = 0
+
+                print seconds_to_time(new_start)" --> "seconds_to_time(new_end)
+            } else {
+                print $0
+            }
+        }' "$input_file" > "$output_file"
+
+    elif [ "$file_ext" = "txt" ]; then
+        echo "🕒 Shifting timestamps in TXT by $shift_by seconds..."
+        # For text files, we need to handle timestamps in formats like [00:05.440]
+        awk -v shift=$shift_by '
+        function time_to_seconds(time_str) {
+            # Remove brackets
+            gsub(/[\[\]]/, "", time_str)
+
+            # Check format - either MM:SS.mmm or HH:MM:SS.mmm
+            if (split(time_str, parts, ":") == 2) {
+                # MM:SS.mmm format
+                mm = parts[1]
+                split(parts[2], sec_parts, ".")
+                ss = sec_parts[1]
+                ms = sec_parts[2] ? sec_parts[2] : 0
+                return mm*60 + ss + ms/1000
+            } else {
+                # HH:MM:SS.mmm format
+                hh = parts[1]
+                mm = parts[2]
+                split(parts[3], sec_parts, ".")
+                ss = sec_parts[1]
+                ms = sec_parts[2] ? sec_parts[2] : 0
+                return hh*3600 + mm*60 + ss + ms/1000
+            }
+        }
+
+        function seconds_to_time(seconds) {
+            h = int(seconds/3600)
+            m = int((seconds-h*3600)/60)
+            s = seconds-h*3600-m*60
+            # Format with up to 3 decimal places for milliseconds
+            if (h > 0) {
+                return sprintf("[%02d:%02d:%05.3f]", h, m, s)
+            } else {
+                return sprintf("[%02d:%05.3f]", m, s)
+            }
+        }
+
+        {
+            line = $0
+            # Match timestamps in the format [MM:SS.mmm] or [HH:MM:SS.mmm]
+            while (match(line, /\[[0-9]+:[0-9]+(\.[0-9]+)?\]/) || match(line, /\[[0-9]+:[0-9]+:[0-9]+(\.[0-9]+)?\]/)) {
+                time_str = substr(line, RSTART, RLENGTH)
+                time_sec = time_to_seconds(time_str)
+
+                new_time = time_sec + shift
+                if (new_time < 0) new_time = 0
+
+                new_time_str = seconds_to_time(new_time)
+
+                # Replace the timestamp
+                line = substr(line, 1, RSTART-1) new_time_str substr(line, RSTART+RLENGTH)
+            }
+            print line
+        }' "$input_file" > "$output_file"
+    else
+        echo "⚠️ Unsupported file extension for shifting: $file_ext"
+        cp "$input_file" "$output_file"
+    fi
+}
+
+# If we're only shifting timestamps, do that and exit
+if [ "$SHIFT_ONLY" = "true" ]; then
+    if [ -z "$INPUT_PREFIX" ]; then
+        echo "❌ ERROR: When using SHIFT_ONLY=true, you must specify INPUT_PREFIX"
+        exit 1
+    fi
+
+    echo "🕒 Performing timestamp shifting by $SHIFT_SECONDS seconds..."
+
+    # Process each file type
+    for ext in txt srt vtt; do
+        # Check for regular transcript
+        if [ -f "${INPUT_PREFIX}.${ext}" ]; then
+            shift_timestamps "${INPUT_PREFIX}.${ext}" "${INPUT_PREFIX}_shifted.${ext}" $SHIFT_SECONDS
+            echo "✅ Created ${INPUT_PREFIX}_shifted.${ext}"
+        fi
+
+        # Check for final transcript
+        if [ -f "${INPUT_PREFIX}_final.${ext}" ]; then
+            shift_timestamps "${INPUT_PREFIX}_final.${ext}" "${INPUT_PREFIX}_final_shifted.${ext}" $SHIFT_SECONDS
+            echo "✅ Created ${INPUT_PREFIX}_final_shifted.${ext}"
+        fi
+    done
+
+    echo "✅ Timestamp shifting complete!"
+    exit 0
+fi
+
+# Generate random suffix
+if command -v openssl > /dev/null 2>&1; then
+    RANDOM_SUFFIX=$(openssl rand -hex 4)
+elif command -v md5sum > /dev/null 2>&1; then
+    RANDOM_SUFFIX=$(date +%s | md5sum | head -c 8)
+elif command -v shasum > /dev/null 2>&1; then
+    RANDOM_SUFFIX=$(date +%s | shasum | head -c 8)
+else
+    RANDOM_SUFFIX=$RANDOM$RANDOM
+fi
+
+AUDIO_FILE="${ORIGINAL_FILENAME}_${START_MIN}_${END_MIN}_${RANDOM_SUFFIX}.wav"
+DIARIZATION_ENABLED=${DIARIZATION_ENABLED:-true}
+HF_TOKEN=${HF_TOKEN:-""}
+BUCKET_NAME=${BUCKET_NAME:-"whisper-video-transcripts"}
+
+# Output file names with the same format
+TRANSCRIPT_PREFIX="${ORIGINAL_FILENAME}_${START_MIN}_${END_MIN}_${RANDOM_SUFFIX}"
+TRANSCRIPT_FILE="${TRANSCRIPT_PREFIX}.txt"
+FINAL_TRANSCRIPT_FILE="${TRANSCRIPT_PREFIX}_final.txt"
+SRT_FILE="${TRANSCRIPT_PREFIX}.srt"
+VTT_FILE="${TRANSCRIPT_PREFIX}.vtt"
+
+# === PRELIMINARY CHECKS ===
+# Stop with status 1 as soon as one of the required local files is missing:
+# the SSH key, the transcription script, the input video.
+[ -f "$KEY_FILE" ] || { echo "❌ Chiave SSH non trovata in $KEY_FILE"; exit 1; }
+[ -f "parallel_transcript.py" ] || { echo "❌ File parallel_transcript.py non trovato"; exit 1; }
+
+[ -f "$VIDEO_FILE" ] || { echo "❌ File video $VIDEO_FILE non trovato"; exit 1; }
+
+# === CONVERTI MP4 IN WAV E APPLICA CROP PRIMA DELL'UPLOAD ===
+echo "🎙️ Converto $VIDEO_FILE in $AUDIO_FILE con crop applicato..."
+FFMPEG_CMD="ffmpeg -i \"$VIDEO_FILE\""
+
+# Aggiungi parametri di crop se START_MIN o END_MIN sono impostati
+if [ "$START_MIN" != "0" ] || [ "$END_MIN" != "0" ]; then
+  START_SEC=$((START_MIN * 60))
+  if [ "$END_MIN" != "0" ]; then
+    END_SEC=$((END_MIN * 60))
+    FFMPEG_CMD+=" -ss $START_SEC -to $END_SEC"
+  else
+    FFMPEG_CMD+=" -ss $START_SEC"
+  fi
+  echo "⏱️ Crop video da $START_MIN min a ${END_MIN:-fine} min"
+fi
+
+# Completa il comando ffmpeg con gli altri parametri necessari
+FFMPEG_CMD+=" -ac 1 -ar 16000 -vn \"$AUDIO_FILE\" -y"
+
+# Esegui il comando ffmpeg
+eval $FFMPEG_CMD
+
+# Upload the audio to S3 unless an object with that key is already listed.
+# AUDIO_UPLOADED records whether THIS run created the object.
+echo "☁️ Controllo se l'audio è già presente su S3..."
+AUDIO_UPLOADED=""
+if aws s3 ls "s3://$BUCKET_NAME/$AUDIO_FILE" >/dev/null 2>&1; then
+  echo "✅ Audio già presente su S3. Salto upload."
+else
+  echo "⬆️ Carico $AUDIO_FILE su S3..."
+  if aws s3 cp "$AUDIO_FILE" "s3://$BUCKET_NAME/"; then
+    AUDIO_UPLOADED="true"
+  else
+    # Without the object on S3 the remote download step cannot work, so
+    # stop here and drop the local WAV created for this run.
+    echo "❌ Upload di $AUDIO_FILE su S3 fallito" >&2
+    rm -f -- "$AUDIO_FILE"
+    exit 1
+  fi
+fi
+
+# === CHECK FOR, OR CREATE, THE DEFAULT VPC ===
+echo "🔍 Controllo default VPC nella regione $REGION..."
+DEFAULT_VPC_ID=$(aws ec2 describe-vpcs --region "$REGION" --filters Name=isDefault,Values=true --query "Vpcs[0].VpcId" --output text)
+
+# "None" is what --output text prints for a null query result; an empty
+# string means the lookup itself produced nothing. Neither is a usable id.
+if [ -z "$DEFAULT_VPC_ID" ] || [ "$DEFAULT_VPC_ID" = "None" ]; then
+  echo "➕ Nessuna default VPC trovata. La creo..."
+  DEFAULT_VPC_ID=$(aws ec2 create-default-vpc --region "$REGION" --query "Vpc.VpcId" --output text)
+  echo "✅ Default VPC creata: $DEFAULT_VPC_ID"
+else
+  echo "✅ Default VPC esistente: $DEFAULT_VPC_ID"
+fi
+
+# === CREATE THE SECURITY GROUP IF NEEDED ===
+if ! aws ec2 describe-security-groups --group-names "$SECURITY_GROUP" --region "$REGION" &>/dev/null; then
+  echo "➕ Creo security group $SECURITY_GROUP..."
+  aws ec2 create-security-group --group-name "$SECURITY_GROUP" --description "Whisper SG" --vpc-id "$DEFAULT_VPC_ID" --region "$REGION"
+  # NOTE(review): this opens SSH (tcp/22) to every address (0.0.0.0/0);
+  # consider restricting the CIDR to the caller's IP.
+  aws ec2 authorize-security-group-ingress --group-name "$SECURITY_GROUP" --protocol tcp --port 22 --cidr 0.0.0.0/0 --region "$REGION"
+fi
+
+# === AVVIA L'ISTANZA EC2 ===
+echo "🚀 Avvio istanza EC2 GPU ($INSTANCE_TYPE con GPU)..."
+INSTANCE_ID=$(aws ec2 run-instances \
+  --image-id $AMI_ID \
+  --instance-type $INSTANCE_TYPE \
+  --key-name $KEY_NAME \
+  --security-groups $SECURITY_GROUP \
+  --iam-instance-profile Name=WhisperS3Profile \
+  --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":50}}]' \
+  --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=whisper-runner}]" \
+  --region $REGION \
+  --query "Instances[0].InstanceId" \
+  --output text)
+
+if [ -z "$INSTANCE_ID" ]; then
+  echo "❌ ERRORE: ID istanza non ottenuto. Verifica che l'AMI sia corretta per la regione $REGION."
+  exit 1
+fi
+
+echo "🆔 Istanza avviata: $INSTANCE_ID"
+
+# === FUNZIONE DI CLEANUP IN CASO DI USCITA IMPROVVISA ===
+function cleanup {
+  echo "🧨 Cleanup in corso..."
+
+  # Rimuove il file audio locale se esiste
+  if [ -f "$AUDIO_FILE" ]; then
+    echo "🧹 Rimuovo file audio locale $AUDIO_FILE..."
+    rm -f "$AUDIO_FILE"
+    echo "✅ File audio locale rimosso."
+  fi
+
+  # Rimuove l'audio da S3 se è stato caricato in questo script
+  if [ "$AUDIO_UPLOADED" = "true" ]; then
+    echo "🧹 Rimuovo $AUDIO_FILE da S3..."
+    aws s3 rm s3://$BUCKET_NAME/$AUDIO_FILE
+    echo "✅ File rimosso da S3."
+  fi
+
+  # Termina l'istanza EC2 se è stata avviata
+  if [ -n "$INSTANCE_ID" ]; then
+    echo "🧹 Termino l'istanza EC2 ($INSTANCE_ID)..."
+    aws ec2 terminate-instances --instance-ids $INSTANCE_ID --region $REGION >/dev/null
+
+    # Aspetta la terminazione con timeout
+    echo "⏳ Aspetto la terminazione dell'istanza (max 60 secondi)..."
+    WAIT_TIMEOUT=60
+    WAIT_START=$(date +%s)
+
+    WAITING=true
+    while [ "$WAITING" = true ]; do
+      # Controlla lo stato dell'istanza
+      STATUS=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID --region $REGION --query "Reservations[0].Instances[0].State.Name" --output text 2>/dev/null)
+
+      # Se lo stato è terminated o l'istanza non esiste più, esci dal ciclo
+      if [ "$STATUS" = "terminated" ] || [ "$STATUS" = "None" ]; then
+        echo "✅ Istanza terminata con successo."
+        WAITING=false
+      else
+        # Controlla se è scaduto il timeout
+        WAIT_ELAPSED=$(($(date +%s) - WAIT_START))
+        if [ $WAIT_ELAPSED -ge $WAIT_TIMEOUT ]; then
+          echo "⚠️ Timeout durante l'attesa della terminazione. L'istanza potrebbe essere ancora in fase di terminazione."
+          WAITING=false
+        else
+          # Aspetta un secondo prima di controllare di nuovo
+          sleep 2
+          echo -n "."
+        fi
+      fi
+    done
+  fi
+}
+
+# Esegui cleanup su qualsiasi uscita: normale, errore, o Ctrl+C
+trap cleanup EXIT
+
+echo "⏳ Attendo che sia pronta..."
+aws ec2 wait instance-running --instance-ids $INSTANCE_ID --region $REGION
+
+echo "🔐 Aspetto che l'istanza sia pronta per SSH..."
+for i in {1..35}; do
+  PUBLIC_IP=$(aws ec2 describe-instances --instance-id $INSTANCE_ID --region $REGION --query "Reservations[0].Instances[0].PublicIpAddress" --output text)
+  echo "🌍 IP pubblico: $PUBLIC_IP"
+
+  nc -zv $PUBLIC_IP 22 >/dev/null 2>&1
+  if [ $? -eq 0 ]; then
+    echo "✅ Porta 22 aperta, l'istanza è pronta!"
+    break
+  else
+    echo "⏳ Tentativo $i/35: porta 22 ancora chiusa. Riprovo tra 5s..."
+    sleep 5
+  fi
+done
+
+# === CARICA SCRIPT PYTHON SULL'ISTANZA ===
+echo "📦 Carico script sulla macchina EC2..."
+scp -o StrictHostKeyChecking=no -i $KEY_FILE parallel_transcript.py ubuntu@$PUBLIC_IP:/home/ubuntu/
+scp -o StrictHostKeyChecking=no -i $KEY_FILE .env ubuntu@$PUBLIC_IP:/home/ubuntu/
+scp -o StrictHostKeyChecking=no -i $KEY_FILE requirements.txt ubuntu@$PUBLIC_IP:/home/ubuntu/
+
+echo "⚙️ Scarico audio da S3 ed eseguo trascrizione avanzata..."
+ssh -t -i $KEY_FILE -o "SendEnv=TERM" ubuntu@$PUBLIC_IP "
+  # Prevent broken pipe errors
+  export PYTHONUNBUFFERED=1
+  set -e
+  cd /home/ubuntu
+
+  echo '⬇️ Download da S3...'
+  aws s3 cp s3://$BUCKET_NAME/$AUDIO_FILE /home/ubuntu/$AUDIO_FILE --region $REGION
+
+  echo '📦 File scaricato:'
+  ls -lh $AUDIO_FILE
+
+  echo '⚙️ Attivo ambiente virtuale...'
+  source whisper-env/bin/activate
+
+  # Installa PyDub se non presente
+  if ! pip list | grep -q pydub; then
+    echo '📦 Installo dipendenze mancanti...'
+    pip install pydub
+  fi
+
+  # Installa le dipendenze da requirements.txt
+  pip install -r requirements.txt
+
+  echo '🖥️ Informazioni GPU:'
+  nvidia-smi
+
+  echo 'Audio file: $AUDIO_FILE'
+  echo 'Token Hugging Face: $HF_TOKEN'
+  echo 'Diarization enabled: $DIARIZATION_ENABLED'
+  echo 'Numero di speaker: $NUM_SPEAKERS'
+  echo '✍️ Lancio trascrizione avanzata...'
+  CMD=\"python3 parallel_transcript.py --audio $AUDIO_FILE --token $HF_TOKEN \
+      --output-prefix $TRANSCRIPT_PREFIX\"
+
+  if [ \"$DIARIZATION_ENABLED\" = false ]; then
+    CMD+=\" --no-diarization\"
+  fi
+
+  if [ -n \"$NUM_SPEAKERS\" ]; then
+    CMD+=\" --num-speakers $NUM_SPEAKERS\"
+    echo '👥 Utilizzo numero di speaker specificato: $NUM_SPEAKERS'
+  fi
+
+  if [ \"$FIX_START\" = true ]; then
+    CMD+=\" --fix-start\"
+    echo '⏱️ Aggiunta correzione per i primi secondi'
+  fi
+
+  eval \$CMD
+"
+
+# === SCARICA I FILE ===
+echo "⬇️ Scarico i file di output..."
+scp -i $KEY_FILE ubuntu@$PUBLIC_IP:/home/ubuntu/${TRANSCRIPT_PREFIX}_final.txt . || echo "⚠️ Impossibile scaricare _final.txt (potrebbe non essere stato generato)"
+scp -i $KEY_FILE ubuntu@$PUBLIC_IP:/home/ubuntu/${TRANSCRIPT_PREFIX}.txt . || echo "⚠️ Impossibile scaricare .txt"
+scp -i $KEY_FILE ubuntu@$PUBLIC_IP:/home/ubuntu/${TRANSCRIPT_PREFIX}.srt . || echo "⚠️ Impossibile scaricare .srt"
+scp -i $KEY_FILE ubuntu@$PUBLIC_IP:/home/ubuntu/${TRANSCRIPT_PREFIX}.vtt . || echo "⚠️ Impossibile scaricare .vtt"
+
+# Scarica anche i file JSON con dati aggiuntivi per debugging
+scp -i $KEY_FILE ubuntu@$PUBLIC_IP:/home/ubuntu/${TRANSCRIPT_PREFIX}.txt.words.json . 2>/dev/null || true
+scp -i $KEY_FILE ubuntu@$PUBLIC_IP:/home/ubuntu/${TRANSCRIPT_PREFIX}_final.txt.diarization.json . 2>/dev/null || true
+scp -i $KEY_FILE ubuntu@$PUBLIC_IP:/home/ubuntu/${TRANSCRIPT_PREFIX}_final.txt.overlaps.json . 2>/dev/null || true
+
+echo "📄 File scaricati:"
+ls -lh ${TRANSCRIPT_PREFIX}* 2>/dev/null || echo "⚠️ Nessun file trovato con il prefisso specificato"