import spaces
import torch
import os
import tempfile
import time
from contextlib import nullcontext
from typing import Any
import gradio as gr
import numpy as np
from diffusers import DiffusionPipeline
from gradio_litmodel3d import LitModel3D
from huggingface_hub import login
from PIL import Image
# Authenticate with Hugging Face using token from environment
# HF_TOKEN is automatically available in Hugging Face Spaces
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
# Login to Hugging Face - this stores the token for all HF Hub operations
login(token=hf_token)
# Also ensure it's set as environment variable for any libraries that check it directly
os.environ["HF_TOKEN"] = hf_token
print("Authenticated with Hugging Face")
else:
print("Warning: HF_TOKEN not found. Gated models may not be accessible.")
print("Please ensure HF_TOKEN is set in your Space's secrets.")
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available")
# Set environment variables for building texture_baker and uv_unwrapper
os.environ["USE_CUDA"] = "1"
os.environ["USE_NATIVE_ARCH"] = "0" # Disable native arch to avoid build issues
def build_texture_baker_and_uv_unwrapper():
# Set CUDA architecture list to avoid detection issues
# PyTorch's build system fails when it can't detect GPU architectures
# Setting TORCH_CUDA_ARCH_LIST explicitly prevents this error
if torch.cuda.is_available():
try:
# Try to get the actual compute capability
compute_cap = torch.cuda.get_device_capability(0)
cuda_arch = f"{compute_cap[0]}.{compute_cap[1]}"
os.environ["TORCH_CUDA_ARCH_LIST"] = cuda_arch
print(
f"Detected CUDA capability: {cuda_arch}, setting TORCH_CUDA_ARCH_LIST={cuda_arch}"
)
except Exception as e:
# Fallback to common architectures if detection fails
# Include multiple architectures to support various GPU models
fallback_archs = "7.0;7.5;8.0;8.6;8.9;9.0"
os.environ["TORCH_CUDA_ARCH_LIST"] = fallback_archs
print(
f"Could not detect CUDA capability: {e}, using fallback architectures: {fallback_archs}"
)
else:
# Should not happen since we check above, but just in case
print("Warning: CUDA not available but trying to build with CUDA support")
os.system(
"USE_CUDA=1 USE_NATIVE_ARCH=0 pip install -vv --no-build-isolation ./texture_baker ./uv_unwrapper"
)
build_texture_baker_and_uv_unwrapper()
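
# These imports depend on the texture_baker and uv_unwrapper native
# extensions compiled above, so they must come after the build step.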
import sf3d.utils as sf3d_utils
from sf3d.system import SF3D
# Set up environment
os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.environ.get("TMPDIR", "/tmp"), "gradio")
# Constants for 3D generation
COND_WIDTH = 512
COND_HEIGHT = 512
COND_DISTANCE = 1.6
COND_FOVY_DEG = 40
BACKGROUND_COLOR = [0.5, 0.5, 0.5]
# Conditioning camera is fixed for every request: a canonical pose at
# COND_DISTANCE with pinhole intrinsics from COND_FOVY_DEG. Computed once.
c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
)
generated_files = []
# Initialize device and SF3D model (like official app)
device = sf3d_utils.get_device()
# SF3D model - initialized at startup like official app
# Token is automatically used after login() call above
print("Loading SF3D model...")
sf3d_model = SF3D.from_pretrained(
"stabilityai/stable-fast-3d",
config_name="config.yaml",
weight_name="model.safetensors",
)
sf3d_model.eval()
sf3d_model = sf3d_model.to(device)
print("SF3D model loaded!")
# SDXL pipeline - initialized at startup
print("Loading Stable Diffusion XL model...")
sd_pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
use_safetensors=True,
variant="fp16" if device == "cuda" else None,
)
if device == "cuda":
sd_pipeline = sd_pipeline.to(device)
# VAE needs to be in float32 for proper decoding (fixes black image issue)
sd_pipeline.vae.to(torch.float32)
    # Enable VAE slicing to reduce peak memory during decoding (optional)
    try:
        sd_pipeline.enable_vae_slicing()
    except Exception:
        pass
    # Enable memory-efficient attention if xformers is installed (optional)
    try:
        sd_pipeline.enable_xformers_memory_efficient_attention()
    except Exception:
        pass
elif device == "mps":
sd_pipeline = sd_pipeline.to(device)
sd_pipeline.vae.to(torch.float32)
else:
sd_pipeline.enable_model_cpu_offload()
sd_pipeline.vae.to(torch.float32)
print("SDXL model loaded!")
@spaces.GPU()
def generate_text_to_image(
prompt: str, negative_prompt: str = "", num_inference_steps: int = 30
):
"""Generate image from text prompt using SDXL."""
print(f"Generating image from prompt: {prompt}")
# Generate image
with torch.no_grad():
if device == "cuda":
# Ensure VAE is in float32
sd_pipeline.vae.to(torch.float32)
# Temporarily override VAE's forward to ensure float32 decoding
original_vae_decode = sd_pipeline.vae.decode
def vae_decode_wrapper(latents, *args, **kwargs):
# Ensure latents are in float32 for decoding
if latents.dtype != torch.float32:
latents = latents.to(torch.float32)
# Disable autocast for VAE decoding
                with torch.autocast(device_type="cuda", enabled=False):
return original_vae_decode(latents, *args, **kwargs)
sd_pipeline.vae.decode = vae_decode_wrapper
try:
result = sd_pipeline(
prompt=prompt,
negative_prompt=negative_prompt if negative_prompt else None,
num_inference_steps=num_inference_steps,
)
image = result.images[0]
finally:
# Restore original decode method
sd_pipeline.vae.decode = original_vae_decode
else:
result = sd_pipeline(
prompt=prompt,
negative_prompt=negative_prompt if negative_prompt else None,
num_inference_steps=num_inference_steps,
)
image = result.images[0]
return image
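

# Example usage (illustrative only; invokes the full SDXL pipeline):
#   image = generate_text_to_image("a cute robot character", "blurry", 30)
#   image.save("robot.png")  # returns a PIL.Image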
def create_batch(input_image: Image.Image) -> dict[str, Any]:
"""Create batch for SF3D model - matches official app structure."""
img_cond = (
torch.from_numpy(
np.asarray(input_image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32)
/ 255.0
)
.float()
.clip(0, 1)
)
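    # The alpha channel is the foreground mask; composite the RGB channels
    # over the neutral gray background with a per-pixel lerp.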
mask_cond = img_cond[:, :, -1:]
rgb_cond = torch.lerp(
torch.tensor(BACKGROUND_COLOR)[None, None, :], img_cond[:, :, :3], mask_cond
)
batch_elem = {
"rgb_cond": rgb_cond,
"mask_cond": mask_cond,
"c2w_cond": c2w_cond.unsqueeze(0),
"intrinsic_cond": intrinsic.unsqueeze(0),
"intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
}
# Add batch dim
batched = {k: v.unsqueeze(0) for k, v in batch_elem.items()}
return batched
def run_model(input_image, remesh_option, vertex_count, texture_size):
"""Run SF3D model - matches official app structure."""
start = time.time()
with torch.no_grad():
with (
torch.autocast(device_type=device, dtype=torch.bfloat16)
if "cuda" in device
else nullcontext()
):
model_batch = create_batch(input_image)
model_batch = {k: v.to(device) for k, v in model_batch.items()}
trimesh_mesh, _glob_dict = sf3d_model.generate_mesh(
model_batch, texture_size, remesh_option.lower(), vertex_count
)
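            # generate_mesh returns one mesh per batch element; batch size is 1 here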
trimesh_mesh = trimesh_mesh[0]
# Create new tmp file in Gradio temp directory for proper serving
os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)
tmp_file = tempfile.NamedTemporaryFile(
delete=False, suffix=".glb", dir=os.environ["GRADIO_TEMP_DIR"]
)
trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
generated_files.append(tmp_file.name)
print("Generation took:", time.time() - start, "s")
print(f"GLB file saved to: {tmp_file.name}")
return tmp_file.name
@spaces.GPU()
def generate_3d_from_image(
input_image: Image.Image,
remesh_option: str = "none",
vertex_count: int = -1,
texture_size: int = 1024,
) -> str:
"""Generate 3D mesh from image using SF3D with built-in background removal."""
    # Convert to RGB if needed (SDXL output is already RGB, but uploaded
    # images may be RGBA or grayscale)
if input_image.mode != "RGB":
input_image = input_image.convert("RGB")
# Use SF3D's built-in background removal
# This handles the conversion to RGBA and background removal
print("Removing background using SF3D's built-in function...")
image_with_bg_removed = sf3d_utils.remove_background(input_image)
    # Rescale the foreground to fill ~85% of the conditioning frame (same
    # ratio as the official app)
    foreground_ratio = 0.85
processed_image = sf3d_utils.resize_foreground(
image_with_bg_removed, foreground_ratio, out_size=(COND_WIDTH, COND_HEIGHT)
)
return run_model(processed_image, remesh_option, vertex_count, texture_size)
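

# Example usage (illustrative; expects a PIL.Image, returns a GLB file path):
#   glb_path = generate_3d_from_image(Image.open("robot.png"), "none", -1, 1024)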
# Gradio Interface Functions
def step1_generate_image(prompt, negative_prompt, num_steps):
"""Step 1: Generate image from text."""
if not prompt:
return None, None
try:
image = generate_text_to_image(prompt, negative_prompt, num_steps)
return (
image,
image, # Auto-fill Step 2 image input
)
    except Exception as e:
        print(f"Image generation failed: {e}")
        return None, None
def step2_generate_3d(image, remesh_option, vertex_count, texture_size):
"""Step 2: Generate 3D model from image (with built-in background removal)."""
if image is None:
return (
None,
None,
)
try:
glb_file = generate_3d_from_image(
image, remesh_option, vertex_count, texture_size
)
return (
glb_file, # Direct file path for LitModel3D
glb_file, # Also return for file download component
)
    except Exception as e:
        print(f"3D generation failed: {e}")
        return (
            None,
            None,
        )
# Create Gradio Interface
custom_css = """
.container {
max-width: 50%;
margin: 0 auto;
}
.container textarea[data-testid*="textbox"],
.container input[type="text"] {
width: 100% !important;
box-sizing: border-box;
}
@media (max-width: 768px) {
.container {
max-width: 100%;
}
}
"""
with gr.Blocks(title="Text to Image to 3D", css=custom_css) as demo:
# Wrap all content including header in a centered container
with gr.Column(elem_classes=["container"]):
gr.Markdown(
"""
# Text to Image to 3D Generation
This app allows you to generate 3D models from text prompts in two steps:
1. **Text to Image**: Generate an image using Stable Diffusion XL
2. **3D Generation**: Create a 3D mesh model using Stable Fast 3D (with automatic background removal)
**Instructions:**
- Enter your text prompt and generate an image
- Review the generated image and continue to generate the 3D model
- Background removal is handled automatically by Stable Fast 3D
- View and download your 3D model as a GLB file
"""
)
# Step 1: Text to Image
gr.Markdown("## Step 1: Text to Image")
# Image generation form
prompt = gr.Textbox(
label="Prompt",
placeholder="A cute robot character, 3D render, colorful",
lines=2,
)
negative_prompt = gr.Textbox(
label="Negative Prompt (optional)",
placeholder="blurry, low quality, distorted",
lines=2,
)
num_steps = gr.Slider(
label="Number of Inference Steps",
minimum=20,
maximum=50,
value=30,
step=5,
)
generate_btn = gr.Button("Generate Image", variant="primary")
# Image preview
step1_image = gr.Image(label="Generated Image", type="pil")
# Step 2: 3D Generation
gr.Markdown("## Step 2: 3D Generation")
gr.Markdown(
"*Background removal is handled automatically. You can use the image from Step 1 or upload your own image.*"
)
# 3D generation input image
step2_image_input = gr.Image(
label="Input Image",
type="pil",
sources=["upload", "clipboard"],
)
# 3D generation form
remesh_option = gr.Radio(
choices=["none", "triangle", "quad"],
label="Remeshing Option",
value="none",
)
vertex_count = gr.Slider(
label="Target Vertex Count (-1 for auto)",
minimum=-1,
maximum=20000,
value=-1,
step=100,
)
texture_size = gr.Slider(
label="Texture Size",
minimum=512,
maximum=2048,
value=1024,
step=256,
)
step2_generate_btn = gr.Button("Generate 3D Model", variant="primary")
# 3D model preview
step2_output = LitModel3D(
label="3D Model Preview",
visible=True,
clear_color=[0.0, 0.0, 0.0, 0.0],
height=600, # Set explicit height for better visibility
)
# File download component
step2_download = gr.File(
label="Download 3D Model (GLB)",
visible=True,
)
# Event handlers
generate_btn.click(
fn=step1_generate_image,
inputs=[prompt, negative_prompt, num_steps],
outputs=[step1_image, step2_image_input],
)
step2_generate_btn.click(
fn=step2_generate_3d,
inputs=[step2_image_input, remesh_option, vertex_count, texture_size],
outputs=[step2_output, step2_download],
)
if __name__ == "__main__":
# Delete previous gradio temp dir folder (like official app)
if os.path.exists(os.environ["GRADIO_TEMP_DIR"]):
print(f"Deleting {os.environ['GRADIO_TEMP_DIR']}")
import shutil
shutil.rmtree(os.environ["GRADIO_TEMP_DIR"])
demo.queue()
demo.launch(share=False)