Speech-to-Text Transcription

Automatic speech-to-text transcription for subtitles, captions, and content indexing

Automatically transcribe speech from your videos with advanced AI-powered speech recognition. Generate accurate transcripts with automatic language detection for subtitles, closed captions, accessibility, and content indexing.

Overview

VideoCascade provides high-quality speech-to-text transcription powered by state-of-the-art AI models. When enabled, the system:

Extracts audio from video at optimal quality (16kHz mono MP3)
Transcribes speech with automatic language detection
Returns transcript with full text, detected language, and downloadable transcript URL

Perfect for generating subtitles, enabling search, improving accessibility, and indexing video content.

Enabling Transcription

Set enableTranscription: true in your video processing request:

curl -X POST https://api.videocascade.com/v1/videos \
  -H "Authorization: Bearer vca_your_api_key" \
  -H "Content-Type: application/json" \
  -d '{
    "fileUrl": "https://example.com/video.mp4",
    "enableTranscription": true
  }'

const response = await fetch('https://api.videocascade.com/v1/videos', {
  method: 'POST',
  headers: {
    'Authorization': 'Bearer vca_your_api_key',
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    fileUrl: 'https://example.com/video.mp4',
    enableTranscription: true,
  }),
});

const data = await response.json();
console.log(`Video ID: ${data.videoId}`);

import requests

response = requests.post(
    'https://api.videocascade.com/v1/videos',
    headers={
        'Authorization': 'Bearer vca_your_api_key',
        'Content-Type': 'application/json',
    },
    json={
        'fileUrl': 'https://example.com/video.mp4',
        'enableTranscription': True,
    }
)

data = response.json()
print(f"Video ID: {data['videoId']}")

interface VideoRequest {
  fileUrl: string;
  enableTranscription?: boolean;
}

const request: VideoRequest = {
fileUrl: 'https://example.com/video.mp4',
enableTranscription: true,
};

const response = await fetch('https://api.videocascade.com/v1/videos', {
method: 'POST',
headers: {
'Authorization': 'Bearer vca_your_api_key',
'Content-Type': 'application/json',
},
body: JSON.stringify(request),
});

const data = await response.json();
console.log(`Video ID: ${data.videoId}`);

Response Structure

The transcript data is included in the video object under the transcript field:

/** Transcript data attached to a video when transcription was requested. */
interface Transcript {
  /** Current state of the transcription job. */
  status: 'pending' | 'processing' | 'completed' | 'failed';
  /** Full transcript text. */
  text?: string;
  /** Detected language code (e.g., 'en', 'es', 'fr'). */
  language?: string;
  /** S3 URL to download full transcript. */
  transcriptUrl?: string;
}

/** Video object returned by the API (fields relevant to transcription). */
interface VideoResponse {
  videoId: string;
  status: 'queued' | 'running' | 'succeeded' | 'failed';
  progressPercent?: number;
  /** URL of the final processed video. */
  finalVideoUrl?: string;
  /** true if transcription was requested */
  hasTranscript: boolean;
  transcript?: Transcript;
  /** Failure details; check this when `transcript.status` is 'failed'. */
  errorMessage?: string;
  // ... other fields
}

{
  "videoId": "v_abc12345",
  "status": "succeeded",
  "progressPercent": 100,
  "finalVideoUrl": "https://storage.example.com/videos/final.mp4",
  "hasTranscript": true,
  "transcript": {
    "status": "completed",
    "text": "Welcome to this tutorial on video processing. Today we'll cover how to automatically transcribe your videos using AI. First, let's discuss why transcription is important for accessibility and SEO...",
    "language": "en",
    "transcriptUrl": "https://storage.example.com/transcripts/v_abc12345.txt"
  },
  "enableTranscription": true,
  "createdAt": "2025-11-23T10:30:00Z",
  "lastUpdatedAt": "2025-11-23T10:32:15Z"
}

Transcript Status States

Status	Description
`pending`	Transcription queued but not started yet
`processing`	Currently extracting audio and transcribing
`completed`	Transcription finished successfully, text available
`failed`	Transcription failed (check `errorMessage` in video response)

Retrieving Transcripts

Option 1: Get Video Status

Retrieve transcript with video status:

const response = await fetch(
  `https://api.videocascade.com/v1/videos/v_abc12345`,
  {
    headers: {
      'Authorization': 'Bearer vca_your_api_key',
    },
  }
);

const video = await response.json();

if (video.hasTranscript && video.transcript.status === 'completed') {
console.log('Language:', video.transcript.language);
console.log('Transcript:', video.transcript.text);
console.log('Download URL:', video.transcript.transcriptUrl);
}

response = requests.get(
    'https://api.videocascade.com/v1/videos/v_abc12345',
    headers={'Authorization': 'Bearer vca_your_api_key'}
)

video = response.json()

if video.get('hasTranscript') and video['transcript']['status'] == 'completed':
    print(f"Language: {video['transcript']['language']}")
    print(f"Transcript: {video['transcript']['text']}")
    print(f"Download URL: {video['transcript']['transcriptUrl']}")

Option 2: Dedicated Transcript Endpoint

Retrieve only the transcript data:

curl -X GET https://api.videocascade.com/v1/videos/v_abc12345/transcript \
  -H "Authorization: Bearer vca_your_api_key"

const response = await fetch(
  'https://api.videocascade.com/v1/videos/v_abc12345/transcript',
  {
    headers: {
      'Authorization': 'Bearer vca_your_api_key',
    },
  }
);

const transcript = await response.json();
console.log(transcript);
// {
// "videoId": "v_abc12345",
// "text": "Welcome to this tutorial...",
// "language": "en",
// "transcriptUrl": "https://storage.example.com/transcripts/v_abc12345.txt",
// "createdAt": "2025-11-23T10:32:15Z"
// }

response = requests.get(
    'https://api.videocascade.com/v1/videos/v_abc12345/transcript',
    headers={'Authorization': 'Bearer vca_your_api_key'}
)

transcript = response.json()
print(transcript)

interface TranscriptResponse {
  videoId: string;
  text: string;
  language: string;
  transcriptUrl: string;
  createdAt: string;
}

const response = await fetch(
'https://api.videocascade.com/v1/videos/v_abc12345/transcript',
{
headers: {
'Authorization': 'Bearer vca_your_api_key',
},
}
);

const transcript: TranscriptResponse = await response.json();
console.log(transcript.text);

Supported Languages

We automatically detect and transcribe 50+ languages:

Language	Code	Language	Code
English	`en`	Spanish	`es`
French	`fr`	German	`de`
Italian	`it`	Portuguese	`pt`
Chinese	`zh`	Japanese	`ja`
Korean	`ko`	Arabic	`ar`
Hindi	`hi`	Turkish	`tr`
Polish	`pl`	Swedish	`sv`
Danish	`da`	Norwegian	`no`
Finnish	`fi`	Greek	`el`
and more

Automatic Detection: You don't need to specify the language - we automatically detect it and return the language code in the response.

Use Cases

Generate Subtitles/Captions

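Create SRT or VTT subtitle files from transcripts. The example below generates SRT; because the transcript text carries no timestamps, each subtitle is given a fixed 3-second duration, so the timings are approximate: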

// Transcribe video
const response = await fetch('https://api.videocascade.com/v1/videos', {
  method: 'POST',
  headers: {
    'Authorization': 'Bearer vca_your_api_key',
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    fileUrl: 'https://example.com/video.mp4',
    enableTranscription: true,
  }),
});

const { videoId } = await response.json();

// Wait for completion
const video = await waitForCompletion(videoId);

// Convert to SRT format
/**
 * Builds SRT subtitle text from a plain transcript.
 *
 * The transcript text carries no timing information, so every cue is given a
 * fixed 3-second window; the resulting timestamps are approximate.
 *
 * @param text - Transcript text. Any run of whitespace (spaces, tabs,
 *   newlines) separates words.
 * @param wordsPerSubtitle - Number of words per cue (default 10). Must be a
 *   positive integer.
 * @returns SRT-formatted cues, or an empty string when the text has no words.
 * @throws RangeError if `wordsPerSubtitle` is not a positive integer.
 */
function convertToSRT(text, wordsPerSubtitle = 10) {
  if (!Number.isInteger(wordsPerSubtitle) || wordsPerSubtitle < 1) {
    // A step of 0 (or less) would keep the loop below from ever terminating.
    throw new RangeError('wordsPerSubtitle must be a positive integer');
  }

  // Split on any whitespace and drop empty pieces so double spaces and line
  // breaks do not turn into blank "words" (or a blank cue for empty text).
  const words = String(text ?? '')
    .trim()
    .split(/\s+/)
    .filter(Boolean);

  const secondsPerCue = 3;
  const cues = [];

  for (let i = 0; i < words.length; i += wordsPerSubtitle) {
    const cueNumber = cues.length + 1; // SRT cue numbers start at 1
    const startTime = formatTime(cues.length * secondsPerCue);
    const endTime = formatTime(cueNumber * secondsPerCue);
    const subtitle = words.slice(i, i + wordsPerSubtitle).join(' ');

    cues.push(`${cueNumber}\n${startTime} --> ${endTime}\n${subtitle}\n\n`);
  }

  return cues.join('');
}

/**
 * Formats a duration as an SRT timestamp (`HH:MM:SS,mmm`).
 *
 * @param seconds - Duration in seconds; fractional values are rounded to the
 *   nearest millisecond. Negative or non-numeric values are treated as 0.
 * @returns The timestamp string, e.g. `00:01:05,250`.
 */
function formatTime(seconds) {
  const value = Number(seconds);
  // Work in whole milliseconds so rounding carries correctly into the
  // seconds/minutes/hours fields (59.9996 s becomes 00:01:00,000).
  const totalMs = Number.isFinite(value) ? Math.max(0, Math.round(value * 1000)) : 0;
  const pad = (part, width) => String(part).padStart(width, '0');

  const h = Math.floor(totalMs / 3600000);
  const m = Math.floor((totalMs % 3600000) / 60000);
  const s = Math.floor((totalMs % 60000) / 1000);
  const ms = totalMs % 1000;

  return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)},${pad(ms, 3)}`;
}

const srtContent = convertToSRT(video.transcript.text);
console.log(srtContent);

Video Content Search

Make video content searchable:

// Transcribe and index videos
/**
 * Submits a video for transcription and AI analysis, waits for processing to
 * finish, then indexes the transcript so the video can be found by its
 * spoken content.
 *
 * @param videoUrl - URL of the video to process.
 * @throws Error if the API request fails or no completed transcript is
 *   available to index.
 */
async function indexVideoContent(videoUrl) {
  const response = await fetch('https://api.videocascade.com/v1/videos', {
    method: 'POST',
    headers: {
      Authorization: 'Bearer vca_your_api_key',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      fileUrl: videoUrl,
      enableTranscription: true,
      enableAiAnalysis: true, // Also get tags
    }),
  });

  // Fail early: an error response has no videoId to wait on.
  if (!response.ok) {
    throw new Error(`API error: ${response.status}`);
  }

  const { videoId } = await response.json();
  const video = await waitForCompletion(videoId);

  // Only index finished transcripts; a failed or missing transcript would
  // otherwise be stored as an empty document.
  const transcript = video.transcript;
  if (transcript?.status !== 'completed') {
    throw new Error(`Transcript not available for video ${videoId}`);
  }

  // Index in search engine
  await searchEngine.index({
    id: videoId,
    transcript: transcript.text,
    language: transcript.language,
    // Analysis can be absent; index without tags rather than crash.
    tags: video.analysis?.data?.tags ?? [],
    // Full-text search
    searchableContent: transcript.text,
  });
}

// Users can now search video content by spoken words
const results = await searchEngine.search('tutorial video processing');

Accessibility Compliance

Provide transcripts for accessibility:

// Generate accessible video page
/**
 * Builds a standalone HTML page containing the video player, the full
 * transcript, and download links.
 *
 * All values taken from the video object are HTML-escaped before being
 * placed in the markup, because titles and transcript text can contain
 * characters such as `<`, `&` or quotes.
 *
 * @param videoId - ID of the video to render.
 * @returns The page as an HTML string.
 */
async function createAccessibleVideoPage(videoId) {
  // Escapes a value for use in HTML text and double-quoted attributes.
  const escapeHtml = (value) =>
    String(value ?? '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  const video = await getVideo(videoId);

  const language = escapeHtml(video.transcript.language);
  const title = escapeHtml(video.videoName);
  const videoUrl = escapeHtml(video.finalVideoUrl);
  const transcriptUrl = escapeHtml(video.transcript.transcriptUrl);
  const transcriptText = escapeHtml(video.transcript.text);

  // NOTE(review): <track> expects a WebVTT file. The example response shows
  // transcriptUrl pointing at a .txt file - confirm the format (or convert
  // the transcript to WebVTT) before relying on this for captions.
  const html = `
    <!DOCTYPE html>
    <html lang="${language}">
    <head>
      <title>Video - ${title}</title>
    </head>
    <body>
      <video controls src="${videoUrl}">
        <track kind="captions" src="${transcriptUrl}" />
      </video>

      <section>
        <h2>Transcript</h2>
        <p>${transcriptText}</p>
      </section>

      <section>
        <h2>Download Options</h2>
        <ul>
          <li><a href="${videoUrl}">Download Video</a></li>
          <li><a href="${transcriptUrl}">Download Transcript</a></li>
        </ul>
      </section>
    </body>
    </html>
  `;

  return html;
}

Meeting Notes & Summaries

Auto-generate meeting notes from recordings:

// Transcribe meeting
const response = await fetch('https://api.videocascade.com/v1/videos', {
  method: 'POST',
  headers: {
    Authorization: 'Bearer vca_your_api_key',
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    fileUrl: 'https://example.com/meeting-recording.mp4',
    enableTranscription: true,
    removeNoise: true, // Clean audio
    normalizeAudio: true, // Balance speaker volumes
  }),
});

const { videoId } = await response.json();
const video = await waitForCompletion(videoId);

// Extract action items using AI
const actionItems = await extractActionItems(video.transcript.text);

// Send meeting notes
await sendEmail({
  to: 'team@example.com',
  subject: 'Meeting Notes - Nov 23',
  body: `
    Full Transcript:
    ${video.transcript.text}

    Action Items:
    ${actionItems.map(item => `- ${item}`).join('\n')}

    Recording: ${video.finalVideoUrl}
    Transcript: ${video.transcript.transcriptUrl}
  `,
});

SEO Optimization

Improve video SEO with transcripts:

// Add transcript to page for search engines
/**
 * Renders a video page fragment with Schema.org `VideoObject` structured
 * data and a visible transcript for users and crawlers.
 *
 * @param videoId - ID of the video to render.
 * @returns The `<article>` markup as an HTML string.
 */
async function renderVideoPage(videoId) {
  // Escapes a value for use in HTML text and double-quoted attributes.
  const escapeHtml = (value) =>
    String(value ?? '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  const video = await getVideo(videoId);

  // Serialize with JSON.stringify so quotes, backslashes and newlines in the
  // transcript cannot produce invalid JSON. "<" is emitted as \u003c so the
  // text can never close the surrounding <script> element.
  const structuredData = JSON.stringify(
    {
      '@context': 'https://schema.org',
      '@type': 'VideoObject',
      name: video.videoName,
      description: video.analysis?.data?.summary || '',
      contentUrl: video.finalVideoUrl,
      transcript: video.transcript.text,
      inLanguage: video.transcript.language,
    },
    null,
    2
  ).replace(/</g, '\\u003c');

  return `
    <article>
      <h1>${escapeHtml(video.videoName)}</h1>

      <video controls src="${escapeHtml(video.finalVideoUrl)}"></video>

      <!-- Schema.org structured data for search engines -->
      <script type="application/ld+json">
${structuredData}
      </script>

      <!-- Visible transcript for users and crawlers -->
      <section>
        <h2>Transcript</h2>
        <p>${escapeHtml(video.transcript.text)}</p>
      </section>
    </article>
  `;
}

Combining with Other Features

Transcription works alongside other video processing features:

// Full video processing pipeline
const response = await fetch('https://api.videocascade.com/v1/videos', {
  method: 'POST',
  headers: {
    'Authorization': 'Bearer vca_your_api_key',
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    fileUrl: 'https://example.com/video.mp4',
    // AI features
    enableTranscription: true,   // Speech-to-text
    enableAiAnalysis: true,      // Visual analysis
    enableThumbnail: true,       // Generate thumbnail
    // Audio enhancement
    normalizeAudio: true,        // Consistent volume
    removeNoise: true,           // Clean background
    removeSilence: true,         // Remove pauses
    // Video processing
    aspectRatio: '16:9',         // Format for YouTube
    compressionQuality: 95,      // High quality
    // Webhook notification
    webhookUrl: 'https://yourapp.com/webhooks/video-complete'
  }),
});

# Full video processing pipeline
response = requests.post(
    'https://api.videocascade.com/v1/videos',
    headers={
        'Authorization': 'Bearer vca_your_api_key',
        'Content-Type': 'application/json',
    },
    json={
        'fileUrl': 'https://example.com/video.mp4',
        # AI features
        'enableTranscription': True,   # Speech-to-text
        'enableAiAnalysis': True,      # Visual analysis
        'enableThumbnail': True,       # Generate thumbnail
        # Audio enhancement
        'normalizeAudio': True,        # Consistent volume
        'removeNoise': True,           # Clean background
        'removeSilence': True,         # Remove pauses
        # Video processing
        'aspectRatio': '16:9',         # Format for YouTube
        'compressionQuality': 95,      # High quality
        # Webhook notification
        'webhookUrl': 'https://yourapp.com/webhooks/video-complete'
    }
)

Best Practices

1. Improve Audio Quality First

Clean audio produces better transcripts:

// ✅ Good: Clean audio before transcription
{
  fileUrl: 'https://example.com/video.mp4',
  enableTranscription: true,
  normalizeAudio: true,  // Consistent volume
  removeNoise: true,     // Remove background noise
}

// ❌ Less effective: Transcribe raw audio
{
  fileUrl: 'https://example.com/noisy-video.mp4',
  enableTranscription: true,
}

2. Use Webhooks for Long Videos

Don't poll - use webhooks for efficient notification:

// ✅ Good: Use webhook
{
  fileUrl: 'https://example.com/long-video.mp4',
  enableTranscription: true,
  webhookUrl: 'https://yourapp.com/webhooks/transcription-complete'
}

3. Store Transcript URL in Database

Save the transcript URL for easy retrieval:

// After transcription completes
await db.videos.update({
  id: videoId,
  transcriptText: video.transcript.text,
  transcriptUrl: video.transcript.transcriptUrl,
  language: video.transcript.language,
  hasTranscript: true,
});

// Later retrieval
const video = await db.videos.findOne({ id: videoId });
console.log(video.transcriptUrl);

4. Handle Multiple Languages

Support multilingual content:

const video = await getVideo(videoId);

// Human-readable names for the language codes returned by the API
const languageNames = {
  en: 'English',
  es: 'Spanish',
  fr: 'French',
  de: 'German',
  // ... etc
};

// Show the name of the detected language; fall back to the raw code for
// languages missing from the map so the message never reads "undefined"
const detectedLanguage = video.transcript.language;
const displayLanguage = languageNames[detectedLanguage] ?? detectedLanguage;

console.log(`Transcript available in ${displayLanguage}`);

5. Cache Transcripts

Avoid repeated API calls:

/**
 * Returns the transcript for a video, serving it from Redis when possible
 * and caching a freshly fetched copy for 7 days.
 *
 * @param videoId - ID of the video whose transcript is wanted.
 * @returns The parsed transcript response.
 * @throws Error if the API request fails; error responses are never cached.
 */
async function getCachedTranscript(videoId) {
  // Check cache first
  const cached = await redis.get(`transcript:${videoId}`);
  if (cached) {
    return JSON.parse(cached);
  }

  // Fetch and cache
  const response = await fetch(
    `https://api.videocascade.com/v1/videos/${videoId}/transcript`,
    {
      headers: { Authorization: 'Bearer vca_your_api_key' },
    }
  );

  // Without this check an error body (e.g. transcript not ready yet) would
  // be stored and served from the cache for the next 7 days.
  if (!response.ok) {
    throw new Error(`API error: ${response.status}`);
  }

  const transcript = await response.json();
  await redis.set(
    `transcript:${videoId}`,
    JSON.stringify(transcript),
    'EX',
    86400 * 7 // Cache 7 days
  );

  return transcript;
}

Limitations & Constraints

Duration Limits

Maximum duration: 10 minutes
Recommended: Under 5 minutes for best results
Long videos: Consider splitting into segments

10 Minute Limit: Videos longer than 10 minutes will have transcription disabled automatically. Split long videos into segments for transcription.

Language Support

Best accuracy:

English, Spanish, French, German, Italian
Mandarin Chinese, Japanese, Korean
Portuguese, Dutch, Russian

Moderate accuracy:

Arabic, Hindi, Turkish, Polish
Most European languages

Limited accuracy:

Rare languages or dialects
Heavy accents or non-native speakers
Technical jargon or domain-specific terms

Audio Quality Requirements

Required:

Clear speech (not mumbled or whispered)
Minimal background noise
Single speaker preferred (multi-speaker works but may be less accurate)

Not suitable for:

Music transcription
Multiple overlapping speakers
Very noisy environments (construction, traffic)
Very quiet or distant speech

Error Handling

Handle common error scenarios:

/**
 * Submits a video for transcription and reports the outcome as a result
 * object instead of throwing.
 *
 * @param videoUrl - URL of the video to transcribe.
 * @returns `{ success: true, text, language, url }` when the transcript
 *   completed, otherwise `{ success: false, error, text: '' }`.
 */
async function transcribeVideoSafely(videoUrl) {
  try {
    const response = await fetch('https://api.videocascade.com/v1/videos', {
      method: 'POST',
      headers: {
        'Authorization': 'Bearer vca_your_api_key',
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        fileUrl: videoUrl,
        enableTranscription: true,
        normalizeAudio: true,  // Improve audio quality
      }),
    });

    if (!response.ok) {
      throw new Error(`API error: ${response.status}`);
    }

    const data = await response.json();
    const videoId = data.videoId;

    // Wait for transcription
    const result = await waitForCompletion(videoId);

    // `transcript` is optional on the video object, so guard both fields.
    const transcript = result.transcript;
    if (!result.hasTranscript || !transcript) {
      return {
        success: false,
        error: 'Transcription not available',
        text: '',
      };
    }

    if (transcript.status === 'failed') {
      return {
        success: false,
        error: result.errorMessage || 'Transcription failed',
        text: '',
      };
    }

    // 'pending' / 'processing' transcripts have no text yet; do not report
    // them as a success.
    if (transcript.status !== 'completed') {
      return {
        success: false,
        error: `Transcription not completed (status: ${transcript.status})`,
        text: '',
      };
    }

    return {
      success: true,
      text: transcript.text,
      language: transcript.language,
      url: transcript.transcriptUrl,
    };
  } catch (error) {
    console.error('Error transcribing video:', error);
    return {
      success: false,
      // Anything can be thrown in JavaScript; only Error objects have `.message`.
      error: error instanceof Error ? error.message : String(error),
      text: '',
    };
  }
}

// Usage
const result = await transcribeVideoSafely(videoUrl);
if (result.success) {
console.log('Transcript:', result.text);
} else {
console.error('Failed:', result.error);
}

def transcribe_video_safely(video_url):
    """Transcribe a video and report the outcome without raising.

    Args:
        video_url: URL of the video to transcribe.

    Returns:
        A dict. On success: ``{'success': True, 'text', 'language', 'url'}``.
        On any failure: ``{'success': False, 'error', 'text': ''}``.
    """
    try:
        response = requests.post(
            'https://api.videocascade.com/v1/videos',
            headers={
                'Authorization': 'Bearer vca_your_api_key',
                'Content-Type': 'application/json',
            },
            json={
                'fileUrl': video_url,
                'enableTranscription': True,
                'normalizeAudio': True,  # Improve audio quality
            },
            # requests waits indefinitely unless a timeout is given
            timeout=30,
        )

        response.raise_for_status()
        data = response.json()
        video_id = data['videoId']

        # Wait for transcription
        result = wait_for_completion(video_id)

        # The transcript object may be absent, so avoid direct indexing.
        transcript = result.get('transcript') or {}
        if not result.get('hasTranscript') or not transcript:
            return {
                'success': False,
                'error': 'Transcription not available',
                'text': ''
            }

        status = transcript.get('status')
        if status == 'failed':
            return {
                'success': False,
                'error': result.get('errorMessage', 'Transcription failed'),
                'text': ''
            }

        # 'pending' / 'processing' transcripts have no text yet; do not
        # report them as a success.
        if status != 'completed':
            return {
                'success': False,
                'error': f"Transcription not completed (status: {status})",
                'text': ''
            }

        return {
            'success': True,
            'text': transcript.get('text', ''),
            'language': transcript.get('language'),
            'url': transcript.get('transcriptUrl')
        }

    except Exception as error:
        # Deliberately broad: this helper reports every failure through the
        # returned dict instead of raising.
        print(f"Error transcribing video: {error}")
        return {
            'success': False,
            'error': str(error),
            'text': ''
        }

# Usage
result = transcribe_video_safely(video_url)
if result['success']:
    print(f"Transcript: {result['text']}")
else:
    print(f"Failed: {result['error']}")

Common Error Messages

Error	Cause	Solution
"Audio file too large"	Extracted audio > 25MB	Use shorter video or lower quality source
"Failed to extract audio"	Video has no audio track	Ensure video contains audio
"Transcription timed out"	OpenAI API timeout (60s)	Retry or use shorter video
"OpenAI API key not configured"	Missing API key	Configure `OPENAI_API_KEY` environment variable
"Video does not have a transcript"	Transcription not enabled	Set `enableTranscription: true`