import spaces  # Import before torch so ZeroGPU can patch CUDA initialization
import torch

import os
import tempfile
import time
from contextlib import nullcontext
from typing import Any

import gradio as gr
import numpy as np
from diffusers import DiffusionPipeline
from gradio_litmodel3d import LitModel3D
from huggingface_hub import login
from PIL import Image

# Authenticate with Hugging Face using the token from the environment.
# HF_TOKEN is automatically available in Hugging Face Spaces.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    # login() stores the token for all HF Hub operations
    login(token=hf_token)
    # Also keep it in the environment for libraries that read it directly
    os.environ["HF_TOKEN"] = hf_token
    print("Authenticated with Hugging Face")
else:
    print("Warning: HF_TOKEN not found. Gated models may not be accessible.")
    print("Please ensure HF_TOKEN is set in your Space's secrets.")

if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available")

# Environment variables for building the texture_baker and uv_unwrapper extensions
os.environ["USE_CUDA"] = "1"
os.environ["USE_NATIVE_ARCH"] = "0"  # Disable native arch to avoid build issues


def build_texture_baker_and_uv_unwrapper():
    # PyTorch's build system fails when it cannot detect GPU architectures,
    # so set TORCH_CUDA_ARCH_LIST explicitly to prevent that error.
    if torch.cuda.is_available():
        try:
            # Try to get the actual compute capability
            compute_cap = torch.cuda.get_device_capability(0)
            cuda_arch = f"{compute_cap[0]}.{compute_cap[1]}"
            os.environ["TORCH_CUDA_ARCH_LIST"] = cuda_arch
            print(
                f"Detected CUDA capability: {cuda_arch}, "
                f"setting TORCH_CUDA_ARCH_LIST={cuda_arch}"
            )
        except Exception as e:
            # Fall back to a list of common architectures if detection fails,
            # covering a range of GPU generations
            fallback_archs = "7.0;7.5;8.0;8.6;8.9;9.0"
            os.environ["TORCH_CUDA_ARCH_LIST"] = fallback_archs
            print(
                f"Could not detect CUDA capability: {e}, "
                f"using fallback architectures: {fallback_archs}"
            )
    else:
        # Unreachable given the check above, but kept as a safety net
        print("Warning: CUDA not available but trying to build with CUDA support")

    os.system(
        "USE_CUDA=1 USE_NATIVE_ARCH=0 pip install -vv --no-build-isolation "
        "./texture_baker ./uv_unwrapper"
    )


build_texture_baker_and_uv_unwrapper()
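# Optional sanity check (a sketch, not part of the original app): confirm the
# freshly built native extensions import cleanly before the heavy models load,
# so a broken build fails fast here instead of deep inside SF3D. Assumes the
# two local packages install under the names "texture_baker" and "uv_unwrapper".
import importlib

for _ext_name in ("texture_baker", "uv_unwrapper"):
    try:
        importlib.import_module(_ext_name)
        print(f"Native extension available: {_ext_name}")
    except ImportError as exc:
        print(f"Warning: failed to import {_ext_name}: {exc}")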
import sf3d.utils as sf3d_utils
from sf3d.system import SF3D

# Set up environment
os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.environ.get("TMPDIR", "/tmp"), "gradio")

# Constants for 3D generation
COND_WIDTH = 512
COND_HEIGHT = 512
COND_DISTANCE = 1.6
COND_FOVY_DEG = 40
BACKGROUND_COLOR = [0.5, 0.5, 0.5]

# Cached conditioning camera. Doesn't change between requests.
c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
    COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
)

generated_files = []

# Initialize device and SF3D model (like the official app)
device = sf3d_utils.get_device()

# SF3D model - initialized at startup like the official app.
# The token is used automatically after the login() call above.
print("Loading SF3D model...")
sf3d_model = SF3D.from_pretrained(
    "stabilityai/stable-fast-3d",
    config_name="config.yaml",
    weight_name="model.safetensors",
)
sf3d_model.eval()
sf3d_model = sf3d_model.to(device)
print("SF3D model loaded!")

# SDXL pipeline - initialized at startup
print("Loading Stable Diffusion XL model...")
sd_pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    use_safetensors=True,
    variant="fp16" if device == "cuda" else None,
)
if device == "cuda":
    sd_pipeline = sd_pipeline.to(device)
    # The VAE needs to run in float32 for proper decoding (fixes black images)
    sd_pipeline.vae.to(torch.float32)
    # Enable VAE slicing for better memory and precision handling
    try:
        sd_pipeline.enable_vae_slicing()
    except Exception:
        pass
    # Enable memory-efficient attention if xformers is available
    try:
        sd_pipeline.enable_xformers_memory_efficient_attention()
    except Exception:
        pass
elif device == "mps":
    sd_pipeline = sd_pipeline.to(device)
    sd_pipeline.vae.to(torch.float32)
else:
    sd_pipeline.enable_model_cpu_offload()
    sd_pipeline.vae.to(torch.float32)
print("SDXL model loaded!")


@spaces.GPU()
def generate_text_to_image(
    prompt: str, negative_prompt: str = "", num_inference_steps: int = 30
):
    """Generate an image from a text prompt using SDXL."""
    print(f"Generating image from prompt: {prompt}")

    with torch.no_grad():
        if device == "cuda":
            # Ensure the VAE is in float32
            sd_pipeline.vae.to(torch.float32)

            # Temporarily wrap the VAE's decode method to force float32 decoding
            original_vae_decode = sd_pipeline.vae.decode

            def vae_decode_wrapper(latents, *args, **kwargs):
                # Cast latents to float32 before decoding
                if latents.dtype != torch.float32:
                    latents = latents.to(torch.float32)
                # Disable autocast so decoding stays in float32
                with torch.autocast(device_type="cuda", enabled=False):
                    return original_vae_decode(latents, *args, **kwargs)

            sd_pipeline.vae.decode = vae_decode_wrapper
            try:
                result = sd_pipeline(
                    prompt=prompt,
                    negative_prompt=negative_prompt if negative_prompt else None,
                    num_inference_steps=num_inference_steps,
                )
                image = result.images[0]
            finally:
                # Restore the original decode method
                sd_pipeline.vae.decode = original_vae_decode
        else:
            result = sd_pipeline(
                prompt=prompt,
                negative_prompt=negative_prompt if negative_prompt else None,
                num_inference_steps=num_inference_steps,
            )
            image = result.images[0]

    return image


def create_batch(input_image: Image.Image) -> dict[str, Any]:
    """Create a batch for the SF3D model - matches the official app structure."""
    img_cond = (
        torch.from_numpy(
            np.asarray(input_image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32)
            / 255.0
        )
        .float()
        .clip(0, 1)
    )
    mask_cond = img_cond[:, :, -1:]
    # Composite the foreground over the neutral background color
    rgb_cond = torch.lerp(
        torch.tensor(BACKGROUND_COLOR)[None, None, :], img_cond[:, :, :3], mask_cond
    )

    batch_elem = {
        "rgb_cond": rgb_cond,
        "mask_cond": mask_cond,
        "c2w_cond": c2w_cond.unsqueeze(0),
        "intrinsic_cond": intrinsic.unsqueeze(0),
        "intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
    }
    # Add batch dimension
    batched = {k: v.unsqueeze(0) for k, v in batch_elem.items()}
    return batched
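# Shape sketch for the batch built above (with COND_WIDTH == COND_HEIGHT == 512;
# the conditioning pose is assumed to be a 4x4 camera-to-world matrix):
#   rgb_cond:              (1, 512, 512, 3)  float32, background-composited RGB
#   mask_cond:             (1, 512, 512, 1)  float32, alpha channel of the input
#   c2w_cond:              (1, 1, 4, 4)      conditioning camera-to-world pose
#   intrinsic_cond:        (1, 1, 3, 3)      pixel-space intrinsics
#   intrinsic_normed_cond: (1, 1, 3, 3)      normalized intrinsics
# The leading dimension is the batch; run_model below moves these to `device`.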
def run_model(input_image, remesh_option, vertex_count, texture_size):
    """Run the SF3D model - matches the official app structure."""
    start = time.time()
    with torch.no_grad():
        with (
            torch.autocast(device_type=device, dtype=torch.bfloat16)
            if "cuda" in device
            else nullcontext()
        ):
            model_batch = create_batch(input_image)
            model_batch = {k: v.to(device) for k, v in model_batch.items()}
            trimesh_mesh, _glob_dict = sf3d_model.generate_mesh(
                model_batch, texture_size, remesh_option.lower(), vertex_count
            )
            trimesh_mesh = trimesh_mesh[0]

    # Create a new temp file in the Gradio temp directory for proper serving
    os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".glb", dir=os.environ["GRADIO_TEMP_DIR"]
    )
    trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
    generated_files.append(tmp_file.name)

    print("Generation took:", time.time() - start, "s")
    print(f"GLB file saved to: {tmp_file.name}")
    return tmp_file.name


@spaces.GPU()
def generate_3d_from_image(
    input_image: Image.Image,
    remesh_option: str = "none",
    vertex_count: int = -1,
    texture_size: int = 1024,
) -> str:
    """Generate a 3D mesh from an image using SF3D with built-in background removal."""
    # Convert to RGB if needed (SDXL outputs RGB)
    if input_image.mode != "RGB":
        input_image = input_image.convert("RGB")

    # Use SF3D's built-in background removal, which handles the
    # conversion to RGBA and removes the background
    print("Removing background using SF3D's built-in function...")
    image_with_bg_removed = sf3d_utils.remove_background(input_image)

    # Resize the foreground if needed (like the official app)
    foreground_ratio = 0.85
    processed_image = sf3d_utils.resize_foreground(
        image_with_bg_removed, foreground_ratio, out_size=(COND_WIDTH, COND_HEIGHT)
    )

    return run_model(processed_image, remesh_option, vertex_count, texture_size)


# Gradio interface functions
def step1_generate_image(prompt, negative_prompt, num_steps):
    """Step 1: Generate an image from text."""
    if not prompt:
        return None, None
    try:
        image = generate_text_to_image(prompt, negative_prompt, num_steps)
        return (
            image,
            image,  # Auto-fill the Step 2 image input
        )
    except Exception as e:
        print(f"Image generation failed: {e}")
        return None, None


def step2_generate_3d(image, remesh_option, vertex_count, texture_size):
    """Step 2: Generate a 3D model from an image (with built-in background removal)."""
    if image is None:
        return None, None
    try:
        glb_file = generate_3d_from_image(
            image, remesh_option, vertex_count, texture_size
        )
        return (
            glb_file,  # Direct file path for LitModel3D
            glb_file,  # Also returned for the file download component
        )
    except Exception as e:
        print(f"3D generation failed: {e}")
        return None, None
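# Note (an alternative sketch, not the original behavior): the handlers above
# log the error and clear their outputs on failure. Raising gr.Error inside the
# except block would surface the message in the UI instead, e.g.:
#     except Exception as e:
#         raise gr.Error(f"Generation failed: {e}")
# gr.Error aborts the event and shows the message in a modal dialog.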
# Create the Gradio interface
custom_css = """
.container {
    max-width: 50%;
    margin: 0 auto;
}
.container textarea[data-testid*="textbox"],
.container input[type="text"] {
    width: 100% !important;
    box-sizing: border-box;
}
@media (max-width: 768px) {
    .container {
        max-width: 100%;
    }
}
"""

with gr.Blocks(title="Text to Image to 3D", css=custom_css) as demo:
    # Wrap all content, including the header, in a centered container
    with gr.Column(elem_classes=["container"]):
        gr.Markdown(
            """
            # Text to Image to 3D Generation

            This app generates 3D models from text prompts in two steps:
            1. **Text to Image**: Generate an image using Stable Diffusion XL
            2. **3D Generation**: Create a 3D mesh model using Stable Fast 3D (with automatic background removal)

            **Instructions:**
            - Enter your text prompt and generate an image
            - Review the generated image and continue to generate the 3D model
            - Background removal is handled automatically by Stable Fast 3D
            - View and download your 3D model as a GLB file
            """
        )

        # Step 1: Text to Image
        gr.Markdown("## Step 1: Text to Image")

        # Image generation form
        prompt = gr.Textbox(
            label="Prompt",
            placeholder="A cute robot character, 3D render, colorful",
            lines=2,
        )
        negative_prompt = gr.Textbox(
            label="Negative Prompt (optional)",
            placeholder="blurry, low quality, distorted",
            lines=2,
        )
        num_steps = gr.Slider(
            label="Number of Inference Steps",
            minimum=20,
            maximum=50,
            value=30,
            step=5,
        )
        generate_btn = gr.Button("Generate Image", variant="primary")

        # Image preview
        step1_image = gr.Image(label="Generated Image", type="pil")

        # Step 2: 3D Generation
        gr.Markdown("## Step 2: 3D Generation")
        gr.Markdown(
            "*Background removal is handled automatically. You can use the image "
            "from Step 1 or upload your own image.*"
        )

        # 3D generation input image
        step2_image_input = gr.Image(
            label="Input Image",
            type="pil",
            sources=["upload", "clipboard"],
        )

        # 3D generation form
        remesh_option = gr.Radio(
            choices=["none", "triangle", "quad"],
            label="Remeshing Option",
            value="none",
        )
        vertex_count = gr.Slider(
            label="Target Vertex Count (-1 for auto)",
            minimum=-1,
            maximum=20000,
            value=-1,
            step=100,
        )
        texture_size = gr.Slider(
            label="Texture Size",
            minimum=512,
            maximum=2048,
            value=1024,
            step=256,
        )
        step2_generate_btn = gr.Button("Generate 3D Model", variant="primary")

        # 3D model preview
        step2_output = LitModel3D(
            label="3D Model Preview",
            visible=True,
            clear_color=[0.0, 0.0, 0.0, 0.0],
            height=600,  # Explicit height for better visibility
        )

        # File download component
        step2_download = gr.File(
            label="Download 3D Model (GLB)",
            visible=True,
        )

    # Event handlers
    generate_btn.click(
        fn=step1_generate_image,
        inputs=[prompt, negative_prompt, num_steps],
        outputs=[step1_image, step2_image_input],
    )
    step2_generate_btn.click(
        fn=step2_generate_3d,
        inputs=[step2_image_input, remesh_option, vertex_count, texture_size],
        outputs=[step2_output, step2_download],
    )


if __name__ == "__main__":
    # Delete the previous Gradio temp dir (like the official app)
    if os.path.exists(os.environ["GRADIO_TEMP_DIR"]):
        print(f"Deleting {os.environ['GRADIO_TEMP_DIR']}")
        import shutil

        shutil.rmtree(os.environ["GRADIO_TEMP_DIR"])

    demo.queue()
    demo.launch(share=False)
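# Usage note (a sketch, assuming the module-level globals above are loaded):
# the two steps can also be driven programmatically, without the UI, e.g.:
#     img = generate_text_to_image("a cute robot character, 3D render")
#     glb_path = generate_3d_from_image(img, remesh_option="triangle")
#     print(glb_path)  # path to the exported .glb in GRADIO_TEMP_DIR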