# Text-to-Image-to-3D app for a Hugging Face Space running on ZeroGPU ("Running on Zero").
import spaces  # ZeroGPU helper; import before torch so CUDA can be shimmed
import torch

import os
import tempfile
import time
from contextlib import nullcontext
from typing import Any

import gradio as gr
import numpy as np
from diffusers import DiffusionPipeline
from gradio_litmodel3d import LitModel3D
from huggingface_hub import login
from PIL import Image

# Authenticate with Hugging Face using the token from the environment.
# HF_TOKEN is automatically available in Hugging Face Spaces.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    # login() stores the token for all HF Hub operations.
    login(token=hf_token)
    # Also keep it in the environment for libraries that read it directly.
    os.environ["HF_TOKEN"] = hf_token
    print("Authenticated with Hugging Face")
else:
    print("Warning: HF_TOKEN not found. Gated models may not be accessible.")
    print("Please ensure HF_TOKEN is set in your Space's secrets.")
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available")

# Environment variables for building the texture_baker and uv_unwrapper extensions.
os.environ["USE_CUDA"] = "1"
os.environ["USE_NATIVE_ARCH"] = "0"  # disable native-arch builds to avoid build issues


def build_texture_baker_and_uv_unwrapper():
    # PyTorch's extension build fails when it cannot detect GPU architectures,
    # so set TORCH_CUDA_ARCH_LIST explicitly before compiling.
    if torch.cuda.is_available():
        try:
            # Use the compute capability of the visible GPU.
            compute_cap = torch.cuda.get_device_capability(0)
            cuda_arch = f"{compute_cap[0]}.{compute_cap[1]}"
            os.environ["TORCH_CUDA_ARCH_LIST"] = cuda_arch
            print(
                f"Detected CUDA capability: {cuda_arch}, "
                f"setting TORCH_CUDA_ARCH_LIST={cuda_arch}"
            )
        except Exception as e:
            # Fall back to a list of common architectures if detection fails.
            fallback_archs = "7.0;7.5;8.0;8.6;8.9;9.0"
            os.environ["TORCH_CUDA_ARCH_LIST"] = fallback_archs
            print(
                f"Could not detect CUDA capability: {e}, "
                f"using fallback architectures: {fallback_archs}"
            )
    else:
        # Unreachable given the check above, but kept as a safeguard.
        print("Warning: CUDA not available but trying to build with CUDA support")
    os.system(
        "USE_CUDA=1 USE_NATIVE_ARCH=0 pip install -vv --no-build-isolation "
        "./texture_baker ./uv_unwrapper"
    )


build_texture_baker_and_uv_unwrapper()
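
# sf3d is imported only after the extensions above are built, since it appears to
# pull in texture_baker and uv_unwrapper at import time.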
import sf3d.utils as sf3d_utils
from sf3d.system import SF3D

# Set up the Gradio temp directory.
os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.environ.get("TMPDIR", "/tmp"), "gradio")

# Constants for 3D generation
COND_WIDTH = 512
COND_HEIGHT = 512
COND_DISTANCE = 1.6
COND_FOVY_DEG = 40
BACKGROUND_COLOR = [0.5, 0.5, 0.5]

# Cached; these never change.
c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
    COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
)
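# c2w_cond above is the single fixed camera pose SF3D conditions on; the
# intrinsics (and their pixel-normalized variant) follow from the 40° FOV.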

generated_files = []

# Initialize device and the SF3D model (mirrors the official app).
device = sf3d_utils.get_device()

# SF3D model, initialized at startup like the official app.
# The token stored by login() above is picked up automatically.
print("Loading SF3D model...")
sf3d_model = SF3D.from_pretrained(
    "stabilityai/stable-fast-3d",
    config_name="config.yaml",
    weight_name="model.safetensors",
)
sf3d_model.eval()
sf3d_model = sf3d_model.to(device)
print("SF3D model loaded!")

# SDXL pipeline, also initialized at startup.
print("Loading Stable Diffusion XL model...")
sd_pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    use_safetensors=True,
    variant="fp16" if device == "cuda" else None,
)
if device == "cuda":
    sd_pipeline = sd_pipeline.to(device)
    # The VAE needs float32 for correct decoding (fixes the black-image issue).
    sd_pipeline.vae.to(torch.float32)
    # Enable VAE slicing for better memory and precision handling.
    try:
        sd_pipeline.enable_vae_slicing()
    except Exception:
        pass
    # Enable memory-efficient attention if xformers is installed.
    try:
        sd_pipeline.enable_xformers_memory_efficient_attention()
    except Exception:
        pass
elif device == "mps":
    sd_pipeline = sd_pipeline.to(device)
    sd_pipeline.vae.to(torch.float32)
else:
    sd_pipeline.enable_model_cpu_offload()
    sd_pipeline.vae.to(torch.float32)
print("SDXL model loaded!")
def generate_text_to_image(
    prompt: str, negative_prompt: str = "", num_inference_steps: int = 30
) -> Image.Image:
    """Generate an image from a text prompt using SDXL."""
    print(f"Generating image from prompt: {prompt}")
    with torch.no_grad():
        if device == "cuda":
            # Make sure the VAE is in float32.
            sd_pipeline.vae.to(torch.float32)

            # Temporarily wrap the VAE's decode to force float32 decoding.
            original_vae_decode = sd_pipeline.vae.decode

            def vae_decode_wrapper(latents, *args, **kwargs):
                # Cast latents to float32 and disable autocast while decoding.
                if latents.dtype != torch.float32:
                    latents = latents.to(torch.float32)
                with torch.autocast(device_type="cuda", enabled=False):
                    return original_vae_decode(latents, *args, **kwargs)

            sd_pipeline.vae.decode = vae_decode_wrapper
            try:
                result = sd_pipeline(
                    prompt=prompt,
                    negative_prompt=negative_prompt if negative_prompt else None,
                    num_inference_steps=num_inference_steps,
                )
                image = result.images[0]
            finally:
                # Restore the original decode method.
                sd_pipeline.vae.decode = original_vae_decode
        else:
            result = sd_pipeline(
                prompt=prompt,
                negative_prompt=negative_prompt if negative_prompt else None,
                num_inference_steps=num_inference_steps,
            )
            image = result.images[0]
    return image


def create_batch(input_image: Image.Image) -> dict[str, Any]:
    """Create a batch for the SF3D model; matches the official app's structure."""
    img_cond = (
        torch.from_numpy(
            np.asarray(input_image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32)
            / 255.0
        )
        .float()
        .clip(0, 1)
    )
    mask_cond = img_cond[:, :, -1:]
    # Composite the RGB channels over the gray background using the alpha mask.
    rgb_cond = torch.lerp(
        torch.tensor(BACKGROUND_COLOR)[None, None, :], img_cond[:, :, :3], mask_cond
    )
    batch_elem = {
        "rgb_cond": rgb_cond,
        "mask_cond": mask_cond,
        "c2w_cond": c2w_cond.unsqueeze(0),
        "intrinsic_cond": intrinsic.unsqueeze(0),
        "intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
    }
    # Add the batch dimension.
    batched = {k: v.unsqueeze(0) for k, v in batch_elem.items()}
    return batched
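

# run_model performs the actual GPU inference, so it gets the same (assumed)
# ZeroGPU decorator.
@spaces.GPU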
def run_model(input_image, remesh_option, vertex_count, texture_size):
    """Run the SF3D model; matches the official app's structure."""
    start = time.time()
    with torch.no_grad():
        with (
            torch.autocast(device_type=device, dtype=torch.bfloat16)
            if "cuda" in device
            else nullcontext()
        ):
            model_batch = create_batch(input_image)
            model_batch = {k: v.to(device) for k, v in model_batch.items()}
            trimesh_mesh, _glob_dict = sf3d_model.generate_mesh(
                model_batch, texture_size, remesh_option.lower(), vertex_count
            )
            trimesh_mesh = trimesh_mesh[0]

    # Export to a fresh temp file inside Gradio's temp dir so it can be served.
    os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".glb", dir=os.environ["GRADIO_TEMP_DIR"]
    )
    trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
    generated_files.append(tmp_file.name)

    print("Generation took:", time.time() - start, "s")
    print(f"GLB file saved to: {tmp_file.name}")
    return tmp_file.name


def generate_3d_from_image(
    input_image: Image.Image,
    remesh_option: str = "none",
    vertex_count: int = -1,
    texture_size: int = 1024,
) -> str:
    """Generate a 3D mesh from an image using SF3D's built-in background removal."""
    # SDXL outputs RGB, but convert in case an uploaded image differs.
    if input_image.mode != "RGB":
        input_image = input_image.convert("RGB")

    # SF3D's built-in background removal handles the conversion to RGBA.
    print("Removing background using SF3D's built-in function...")
    image_with_bg_removed = sf3d_utils.remove_background(input_image)

    # Resize the foreground to fill 85% of the frame (like the official app).
    foreground_ratio = 0.85
    processed_image = sf3d_utils.resize_foreground(
        image_with_bg_removed, foreground_ratio, out_size=(COND_WIDTH, COND_HEIGHT)
    )
    return run_model(processed_image, remesh_option, vertex_count, texture_size)


# Gradio interface functions
def step1_generate_image(prompt, negative_prompt, num_steps):
    """Step 1: generate an image from text."""
    if not prompt:
        return None, None
    try:
        image = generate_text_to_image(prompt, negative_prompt, num_steps)
        # Return the image twice: once for the preview, once to auto-fill Step 2.
        return image, image
    except Exception as e:
        print(f"Image generation failed: {e}")
        return None, None


def step2_generate_3d(image, remesh_option, vertex_count, texture_size):
    """Step 2: generate a 3D model from an image (background removal built in)."""
    if image is None:
        return None, None
    try:
        glb_file = generate_3d_from_image(
            image, remesh_option, vertex_count, texture_size
        )
        # Return the path twice: for the LitModel3D preview and the download component.
        return glb_file, glb_file
    except Exception as e:
        print(f"3D generation failed: {e}")
        return None, None


# Create the Gradio interface
custom_css = """
.container {
    max-width: 50%;
    margin: 0 auto;
}
.container textarea[data-testid*="textbox"],
.container input[type="text"] {
    width: 100% !important;
    box-sizing: border-box;
}
@media (max-width: 768px) {
    .container {
        max-width: 100%;
    }
}
"""

with gr.Blocks(title="Text to Image to 3D", css=custom_css) as demo:
    # Wrap all content, including the header, in a centered container.
    with gr.Column(elem_classes=["container"]):
        gr.Markdown(
            """
            # Text to Image to 3D Generation

            This app generates 3D models from text prompts in two steps:

            1. **Text to Image**: generate an image with Stable Diffusion XL
            2. **3D Generation**: create a 3D mesh with Stable Fast 3D (with automatic background removal)

            **Instructions:**
            - Enter your text prompt and generate an image
            - Review the generated image and continue to generate the 3D model
            - Background removal is handled automatically by Stable Fast 3D
            - View and download your 3D model as a GLB file
            """
        )

        # Step 1: text to image
        gr.Markdown("## Step 1: Text to Image")

        # Image generation form
        prompt = gr.Textbox(
            label="Prompt",
            placeholder="A cute robot character, 3D render, colorful",
            lines=2,
        )
        negative_prompt = gr.Textbox(
            label="Negative Prompt (optional)",
            placeholder="blurry, low quality, distorted",
            lines=2,
        )
        num_steps = gr.Slider(
            label="Number of Inference Steps",
            minimum=20,
            maximum=50,
            value=30,
            step=5,
        )
        generate_btn = gr.Button("Generate Image", variant="primary")

        # Image preview
        step1_image = gr.Image(label="Generated Image", type="pil")

        # Step 2: 3D generation
        gr.Markdown("## Step 2: 3D Generation")
        gr.Markdown(
            "*Background removal is handled automatically. You can use the image "
            "from Step 1 or upload your own image.*"
        )

        # 3D generation input image
        step2_image_input = gr.Image(
            label="Input Image",
            type="pil",
            sources=["upload", "clipboard"],
        )

        # 3D generation form
        remesh_option = gr.Radio(
            choices=["none", "triangle", "quad"],
            label="Remeshing Option",
            value="none",
        )
        vertex_count = gr.Slider(
            label="Target Vertex Count (-1 for auto)",
            minimum=-1,
            maximum=20000,
            value=-1,
            step=100,
        )
        texture_size = gr.Slider(
            label="Texture Size",
            minimum=512,
            maximum=2048,
            value=1024,
            step=256,
        )
        step2_generate_btn = gr.Button("Generate 3D Model", variant="primary")

        # 3D model preview
        step2_output = LitModel3D(
            label="3D Model Preview",
            visible=True,
            clear_color=[0.0, 0.0, 0.0, 0.0],
            height=600,  # explicit height for better visibility
        )

        # File download component
        step2_download = gr.File(
            label="Download 3D Model (GLB)",
            visible=True,
        )

    # Event handlers
    generate_btn.click(
        fn=step1_generate_image,
        inputs=[prompt, negative_prompt, num_steps],
        outputs=[step1_image, step2_image_input],
    )
    step2_generate_btn.click(
        fn=step2_generate_3d,
        inputs=[step2_image_input, remesh_option, vertex_count, texture_size],
        outputs=[step2_output, step2_download],
    )


if __name__ == "__main__":
    # Delete any previous Gradio temp dir (like the official app).
    if os.path.exists(os.environ["GRADIO_TEMP_DIR"]):
        print(f"Deleting {os.environ['GRADIO_TEMP_DIR']}")
        import shutil

        shutil.rmtree(os.environ["GRADIO_TEMP_DIR"])

    demo.queue()
    demo.launch(share=False)