Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import sys | |
| import json | |
| import time | |
| from typing import List, Tuple, Optional | |
| import numpy as np | |
| import librosa | |
| from pathlib import Path | |
| # Add src to path for imports | |
| sys.path.append(os.path.join(os.path.dirname(__file__), '..')) | |
| from models.speech_to_text import FreeIndianSpeechToText | |
| from utils.config import load_config | |
| from utils.audio_utils import AudioProcessor | |
| class GradioSpeechToTextApp: | |
| """ | |
| Gradio web interface for Indian Speech-to-Text models. | |
| Provides an intuitive UI for testing different models and languages. | |
| """ | |
| def __init__(self): | |
| self.config = load_config() | |
| self.current_model = None | |
| self.audio_processor = AudioProcessor() | |
| self.supported_languages = { | |
| "Hindi": "hi", | |
| "Tamil": "ta", | |
| "Bengali": "bn", | |
| "Telugu": "te", | |
| "Marathi": "mr", | |
| "Gujarati": "gu", | |
| "Kannada": "kn", | |
| "Malayalam": "ml", | |
| "Punjabi": "pa", | |
| "Odia": "or", | |
| "Assamese": "as", | |
| "Urdu": "ur", | |
| "English": "en" | |
| } | |
| # Initialize with default model | |
| self.initialize_model() | |
| def initialize_model(self): | |
| """Initialize the default model.""" | |
| try: | |
| default_model = self.config.get("DEFAULT_MODEL", "distil-whisper") | |
| self.current_model = FreeIndianSpeechToText( | |
| model_type=default_model, | |
| cache_dir=self.config.get("MODEL_CACHE_DIR", "./models") | |
| ) | |
| return f"β Initialized with {default_model} model" | |
| except Exception as e: | |
| return f"β Error initializing model: {str(e)}" | |
| def transcribe_audio(self, audio_input, model_choice: str, language_choice: str, | |
| enable_preprocessing: bool = True) -> Tuple[str, str, str]: | |
| """ | |
| Main transcription function for Gradio interface. | |
| Returns: | |
| Tuple of (transcription_text, model_info, processing_stats) | |
| """ | |
| if audio_input is None: | |
| return "β No audio provided", "", "" | |
| try: | |
| # Switch model if needed | |
| if not self.current_model or self.current_model.current_model_name != model_choice: | |
| status = self.switch_model(model_choice) | |
| if not status.startswith("β "): | |
| return f"β Model loading failed: {status}", "", "" | |
| # Get language code | |
| language_code = self.supported_languages.get(language_choice, "hi") | |
| # Preprocess audio if enabled | |
| if enable_preprocessing: | |
| try: | |
| audio_data = self.audio_processor.preprocess_audio(audio_input) | |
| except Exception as e: | |
| # Fallback to original audio if preprocessing fails | |
| audio_data = audio_input | |
| print(f"Preprocessing failed, using original: {e}") | |
| else: | |
| audio_data = audio_input | |
| # Perform transcription | |
| start_time = time.time() | |
| result = self.current_model.transcribe(audio_data, language_code) | |
| end_time = time.time() | |
| if result["success"]: | |
| # Format results | |
| transcription = result["text"] | |
| # Model information | |
| model_info = self.format_model_info(result) | |
| # Processing statistics | |
| processing_stats = self.format_processing_stats(result, end_time - start_time) | |
| return transcription, model_info, processing_stats | |
| else: | |
| return f"β Transcription failed: {result.get('error', 'Unknown error')}", "", "" | |
| except Exception as e: | |
| return f"β Error during transcription: {str(e)}", "", "" | |
| def switch_model(self, model_name: str) -> str: | |
| """Switch to a different model.""" | |
| try: | |
| if self.current_model: | |
| success = self.current_model.switch_model(model_name) | |
| if success: | |
| return f"β Switched to {model_name}" | |
| else: | |
| return f"β Failed to switch to {model_name}" | |
| else: | |
| self.current_model = FreeIndianSpeechToText( | |
| model_type=model_name, | |
| cache_dir=self.config.get("MODEL_CACHE_DIR", "./models") | |
| ) | |
| return f"β Loaded {model_name}" | |
| except Exception as e: | |
| return f"β Error switching model: {str(e)}" | |
| def batch_transcribe(self, files: List, model_choice: str, language_choice: str) -> str: | |
| """Batch transcription for multiple files.""" | |
| if not files: | |
| return "β No files provided" | |
| try: | |
| # Switch model if needed | |
| if not self.current_model or self.current_model.current_model_name != model_choice: | |
| status = self.switch_model(model_choice) | |
| if not status.startswith("β "): | |
| return f"β Model loading failed: {status}" | |
| language_code = self.supported_languages.get(language_choice, "hi") | |
| # Process files | |
| file_paths = [file.name for file in files] | |
| results = self.current_model.batch_transcribe(file_paths, language_code) | |
| # Format results | |
| output = "# Batch Transcription Results\n\n" | |
| for i, result in enumerate(results, 1): | |
| if result["success"]: | |
| output += f"## File {i}: {Path(result['file']).name}\n" | |
| output += f"**Transcription:** {result['text']}\n" | |
| output += f"**Processing Time:** {result.get('processing_time', 0):.2f}s\n\n" | |
| else: | |
| output += f"## File {i}: {Path(result['file']).name}\n" | |
| output += f"**Error:** {result.get('error', 'Unknown error')}\n\n" | |
| return output | |
| except Exception as e: | |
| return f"β Batch processing error: {str(e)}" | |
| def get_model_comparison(self) -> str: | |
| """Generate model comparison table.""" | |
| if not self.current_model: | |
| return "β No model loaded" | |
| models = self.current_model.get_available_models() | |
| comparison = "# Available Models Comparison\n\n" | |
| comparison += "| Model | Type | Size | Languages | Description |\n" | |
| comparison += "|-------|------|------|-----------|-------------|\n" | |
| for name, config in models.items(): | |
| comparison += f"| {name} | {config['type']} | {config['size']} | {config['languages']} | {config['description']} |\n" | |
| return comparison | |
| def format_model_info(self, result: dict) -> str: | |
| """Format model information for display.""" | |
| model_info = f""" | |
| **Model:** {result['model']} | |
| **Device:** {result['device']} | |
| **Language:** {result['language']} | |
| """ | |
| return model_info.strip() | |
| def format_processing_stats(self, result: dict, total_time: float) -> str: | |
| """Format processing statistics.""" | |
| stats = f""" | |
| **Total Processing Time:** {total_time:.2f}s | |
| **Model Processing Time:** {result.get('processing_time', 0):.2f}s | |
| **Status:** {'β Success' if result['success'] else 'β Failed'} | |
| """ | |
| return stats.strip() | |
| def create_interface(self) -> gr.Blocks: | |
| """Create the Gradio interface.""" | |
| # Custom CSS for better styling | |
| css = """ | |
| .gradio-container { | |
| font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; | |
| } | |
| .model-info { | |
| background-color: #f0f8ff; | |
| padding: 10px; | |
| border-radius: 5px; | |
| border-left: 4px solid #4CAF50; | |
| } | |
| .stats-info { | |
| background-color: #fff8f0; | |
| padding: 10px; | |
| border-radius: 5px; | |
| border-left: 4px solid #ff9800; | |
| } | |
| """ | |
| with gr.Blocks(css=css, title="Indian Speech-to-Text Models", theme=gr.themes.Soft()) as interface: | |
| gr.Markdown(""" | |
| # π€ Complete Guide to Free Open-Source Speech-to-Text Models for Indian Languages | |
| This application provides access to multiple free, open-source speech-to-text models optimized for Indian languages. | |
| All models are completely free to use and can be deployed commercially. | |
| """) | |
| with gr.Tab("π― Single Audio Transcription"): | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| # Audio input | |
| audio_input = gr.Audio( | |
| label="Upload Audio File or Record", | |
| type="filepath", | |
| sources=["upload", "microphone"] | |
| ) | |
| # Model selection | |
| model_choice = gr.Dropdown( | |
| choices=[ | |
| "distil-whisper", "whisper-free", "whisper-small", | |
| "wav2vec2-hindi", "wav2vec2-improved", "wav2vec2-multilang", | |
| "seamless", "speecht5" | |
| ], | |
| value="distil-whisper", | |
| label="Select Model", | |
| info="Choose the speech-to-text model" | |
| ) | |
| # Language selection | |
| language_choice = gr.Dropdown( | |
| choices=list(self.supported_languages.keys()), | |
| value="Hindi", | |
| label="Select Language", | |
| info="Choose the audio language" | |
| ) | |
| # Preprocessing option | |
| enable_preprocessing = gr.Checkbox( | |
| value=True, | |
| label="Enable Audio Preprocessing", | |
| info="Normalize and clean audio for better results" | |
| ) | |
| # Transcribe button | |
| transcribe_btn = gr.Button("π― Transcribe Audio", variant="primary", size="lg") | |
| with gr.Column(scale=3): | |
| # Results | |
| transcription_output = gr.Textbox( | |
| label="Transcription Result", | |
| lines=6, | |
| placeholder="Transcription will appear here..." | |
| ) | |
| with gr.Row(): | |
| model_info_output = gr.Markdown( | |
| label="Model Information", | |
| elem_classes=["model-info"] | |
| ) | |
| processing_stats = gr.Markdown( | |
| label="Processing Statistics", | |
| elem_classes=["stats-info"] | |
| ) | |
| with gr.Tab("π Batch Processing"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| # File upload for batch processing | |
| batch_files = gr.File( | |
| label="Upload Multiple Audio Files", | |
| file_count="multiple", | |
| file_types=["audio"] | |
| ) | |
| # Model and language for batch | |
| batch_model = gr.Dropdown( | |
| choices=[ | |
| "distil-whisper", "whisper-free", "whisper-small", | |
| "wav2vec2-hindi", "wav2vec2-improved" | |
| ], | |
| value="distil-whisper", | |
| label="Select Model for Batch Processing" | |
| ) | |
| batch_language = gr.Dropdown( | |
| choices=list(self.supported_languages.keys()), | |
| value="Hindi", | |
| label="Select Language for All Files" | |
| ) | |
| batch_btn = gr.Button("π Process Batch", variant="primary") | |
| with gr.Column(): | |
| batch_results = gr.Markdown( | |
| label="Batch Results", | |
| value="Upload files and click 'Process Batch' to see results here." | |
| ) | |
| with gr.Tab("π Model Comparison"): | |
| gr.Markdown("## Model Performance Comparison") | |
| comparison_btn = gr.Button("π Generate Comparison Table") | |
| comparison_output = gr.Markdown() | |
| gr.Markdown(""" | |
| ### Model Recommendations: | |
| - **Distil-Whisper Large-v3**: Best overall choice - 6x faster, 49% smaller, <1% WER difference | |
| - **OpenAI Whisper Large-v3**: Best accuracy for complex audio | |
| - **Wav2Vec2 Hindi Models**: Specialized for Hindi language | |
| - **Whisper Small**: Good balance for CPU-only deployment | |
| - **SeamlessM4T**: Best for multilingual scenarios (101 languages) | |
| """) | |
| with gr.Tab("βΉοΈ About & Setup"): | |
| gr.Markdown(""" | |
| ## About This Application | |
| This application showcases free, open-source speech-to-text models specifically optimized for Indian languages. | |
| All models are available under permissive licenses (MIT, Apache 2.0) and can be used commercially. | |
| ### Supported Languages: | |
| - Hindi, Tamil, Bengali, Telugu, Marathi | |
| - Gujarati, Kannada, Malayalam, Punjabi, Odia | |
| - Assamese, Urdu, English | |
| ### Key Features: | |
| - β Multiple free model architectures | |
| - β Real-time and batch processing | |
| - β Audio preprocessing and optimization | |
| - β Performance metrics and comparison | |
| - β Commercial use allowed | |
| ### Technical Stack: | |
| - **Models**: Transformers, PyTorch, TensorFlow | |
| - **Interface**: Gradio | |
| - **Audio Processing**: Librosa, SoundFile | |
| - **Optimization**: CUDA support, Mixed precision | |
| ### Setup Instructions: | |
| 1. Install dependencies: `pip install -r requirements.txt` | |
| 2. Set environment: `export APP_ENV=local` | |
| 3. Run application: `python app.py` | |
| """) | |
| # Event handlers | |
| transcribe_btn.click( | |
| fn=self.transcribe_audio, | |
| inputs=[audio_input, model_choice, language_choice, enable_preprocessing], | |
| outputs=[transcription_output, model_info_output, processing_stats] | |
| ) | |
| batch_btn.click( | |
| fn=self.batch_transcribe, | |
| inputs=[batch_files, batch_model, batch_language], | |
| outputs=[batch_results] | |
| ) | |
| comparison_btn.click( | |
| fn=self.get_model_comparison, | |
| outputs=[comparison_output] | |
| ) | |
| return interface | |
| def launch(self, share: bool = None, server_name: str = None, server_port: int = None): | |
| """Launch the Gradio application.""" | |
| interface = self.create_interface() | |
| # Use config values or defaults | |
| share = share if share is not None else self.config.get("GRADIO_SHARE", False) | |
| server_name = server_name or self.config.get("GRADIO_SERVER_NAME", "127.0.0.1") | |
| server_port = server_port or int(self.config.get("GRADIO_SERVER_PORT", 7860)) | |
| print(f"π Launching Speech-to-Text Application...") | |
| print(f"π Server: http://{server_name}:{server_port}") | |
| print(f"π Share: {share}") | |
| interface.launch( | |
| share=share, | |
| server_name=server_name, | |
| server_port=server_port, | |
| show_error=True, | |
| quiet=False | |
| ) | |
| def main(): | |
| """Main function to run the application.""" | |
| app = GradioSpeechToTextApp() | |
| app.launch() | |
| if __name__ == "__main__": | |
| main() | |