|
|
import marimo

__generated_with = "0.10.9"
app = marimo.App(width="medium")
|
|
|
|
|
|
|
|
@app.cell
def _():
    import marimo as mo

    return (mo,)
|
|
|
|
|
|
|
|
@app.cell
def _(mo):
    mo.md(
        r"""
        # VLM vs Text: Extracting Metadata from Book Covers

        **The Task**: Libraries and archives hold millions of digitized book covers whose metadata is incomplete or missing. Can we use AI to automatically extract titles and other metadata?

        **The Question**: Should we use Vision-Language Models (VLMs) that "see" the cover image, or extract the text first and send it to a standard LLM?

        **The Answer**: VLMs win decisively for this task.

        ---

        This evaluation uses the [DOAB (Directory of Open Access Books)](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset of academic book covers. We compare two approaches:

        | Approach | How it works |
        |----------|--------------|
        | **VLM** | Send the cover image directly to a Vision-Language Model |
        | **Text** | Extract the text from the image first (OCR), then send it to an LLM |
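
        In code, the difference is simply what the model receives as input. Here is a minimal sketch of the two prompt styles using Inspect AI's chat content types (the actual task definitions in the eval may differ; the prompt and file path are illustrative):

        ```python
        from inspect_ai.model import ChatMessageUser, ContentImage, ContentText

        PROMPT = "Extract the book's title from this cover."

        # VLM approach: the model is shown the cover image itself
        vlm_message = ChatMessageUser(content=[
            ContentImage(image="cover.png"),  # hypothetical path to a cover image
            ContentText(text=PROMPT),
        ])

        # Text approach: the model only sees pre-extracted (OCR) text
        extracted_text = "..."  # text pulled from the cover beforehand
        text_message = ChatMessageUser(content=f"{PROMPT} Cover text: {extracted_text}")
        ```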
|
|
|
|
|
        ---

        ## Evaluation Results

        Select a task below to see how different models performed:
        """
    )
    return
|
|
|
|
|
|
|
|
@app.cell
def _():
    import pandas as pd
    import altair as alt

    from inspect_ai.analysis import evals_df

    return alt, evals_df, pd
|
|
|
|
|
|
|
|
@app.cell
def _(evals_df, mo):
    # Load the evaluation logs from the Hub (cached locally across reruns)
    with mo.persistent_cache(name="doab_evals"):
        df_raw = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals", quiet=True)

    # Label each run by approach (VLM vs. text extraction) and shorten model ids
    df_raw["approach"] = df_raw["task_name"].apply(lambda x: "VLM" if "vlm" in x else "Text")
    df_raw["model_short"] = df_raw["model"].apply(lambda x: x.split("/")[-1])

    def get_task_category(task_name):
        # LLM-judged tasks score full metadata; everything else is title extraction
        if "llm_judge" in task_name:
            return "Full Metadata"
        return "Title Extraction"

    df_raw["task_category"] = df_raw["task_name"].apply(get_task_category)

    # The headline score is a 0-1 fraction; convert it to a percentage
    df_raw["accuracy"] = df_raw["score_headline_value"] * 100

    # Parameter counts (in billions) and model-card links for each evaluated model
    model_info = {
        "hf-inference-providers/Qwen/Qwen3-VL-8B-Instruct": {
            "params": 8,
            "url": "https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct",
        },
        "hf-inference-providers/Qwen/Qwen3-VL-30B-A3B-Thinking": {
            "params": 30,
            "url": "https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking",
        },
        "hf-inference-providers/zai-org/GLM-4.6V-Flash": {
            "params": 9,
            "url": "https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking",
        },
        "hf-inference-providers/openai/gpt-oss-20b": {
            "params": 20,
            "url": "https://huggingface.co/openai/gpt-oss-20b",
        },
        "hf-inference-providers/Qwen/Qwen3-4B-Instruct-2507": {
            "params": 4,
            "url": "https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507",
        },
        "hf-inference-providers/allenai/Olmo-3-7B-Instruct": {
            "params": 7,
            "url": "https://huggingface.co/allenai/Olmo-3-7B-Instruct",
        },
    }
    df_raw["param_size_b"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("params"))
    df_raw["model_url"] = df_raw["model"].apply(lambda x: model_info.get(x, {}).get("url", ""))

    df_raw
    return df_raw, get_task_category, model_info
|
|
|
|
|
|
|
|
@app.cell
def _(alt, df_raw, mo):
    def make_task_content(task_name):
        """Generate the complete results view for a task."""
        df = df_raw[df_raw["task_category"] == task_name].copy()

        # Average accuracy per approach, and the gap between them
        vlm_avg = df[df["approach"] == "VLM"]["accuracy"].mean()
        text_avg = df[df["approach"] == "Text"]["accuracy"].mean()
        diff = vlm_avg - text_avg

        task_desc = (
            "book titles"
            if task_name == "Title Extraction"
            else "full metadata (title, subtitle, publisher, year, ISBN)"
        )

        results_md = mo.md(
            f"""
            ### Summary

            | Approach | Average Accuracy |
            |----------|------------------|
            | **VLM (Vision)** | **{vlm_avg:.0f}%** |
            | Text Extraction | {text_avg:.0f}% |

            **VLM advantage: +{diff:.0f} percentage points**

            VLMs {'significantly ' if diff > 15 else ''}outperform text extraction for extracting {task_desc}.
            """
        )

        # Scatter plot: parameter count vs. accuracy, colored by approach
        chart = (
            alt.Chart(df)
            .mark_circle(size=200, opacity=0.8)
            .encode(
                x=alt.X("param_size_b:Q", title="Parameters (Billions)", scale=alt.Scale(zero=False)),
                y=alt.Y("accuracy:Q", title="Accuracy (%)", scale=alt.Scale(domain=[50, 105])),
                color=alt.Color(
                    "approach:N",
                    title="Approach",
                    scale=alt.Scale(domain=["VLM", "Text"], range=["#1f77b4", "#ff7f0e"]),
                ),
                tooltip=[
                    alt.Tooltip("model_short:N", title="Model"),
                    alt.Tooltip("approach:N", title="Approach"),
                    alt.Tooltip("param_size_b:Q", title="Params (B)"),
                    alt.Tooltip("accuracy:Q", title="Accuracy", format=".1f"),
                ],
            )
            .properties(width=500, height=300, title="Model Size vs Accuracy")
            .configure_axis(labelFontSize=12, titleFontSize=14)
        )

        # Markdown leaderboard, sorted by accuracy (best first)
        leaderboard_md = "### Model Leaderboard\n\n| Model | Approach | Params (B) | Accuracy (%) |\n|-------|----------|------------|--------------|\n"
        for _, row in df.sort_values("accuracy", ascending=False).iterrows():
            model_link = f"[{row['model_short']}]({row['model_url']})" if row["model_url"] else row["model_short"]
            leaderboard_md += f"| {model_link} | {row['approach']} | {row['param_size_b']} | {row['accuracy']:.1f} |\n"

        return mo.vstack([
            results_md,
            mo.md("### Model Size vs Accuracy"),
            mo.as_html(chart),
            mo.md("*Hover over points to see model details*"),
            mo.md(leaderboard_md),
        ])

    tabs = mo.ui.tabs({
        "Title Extraction": make_task_content("Title Extraction"),
        "Full Metadata": make_task_content("Full Metadata"),
    })

    tabs
    return make_task_content, tabs
|
|
|
|
|
|
|
|
@app.cell
def _(mo):
    mo.md(
        r"""
        ---

        ## Why VLMs Win

        Book covers are **visually structured** documents:

        - **Spatial layout**: titles appear in predictable places (usually top or center)
        - **Typography**: larger text signals importance, so the biggest text is likely the title
        - **Visual hierarchy**: authors, publishers, and other details have distinct styling

        When you extract the text first (OCR), you **flatten this structure** into a linear sequence. The model loses the visual cues that make it obvious what is a title vs. a subtitle vs. an author name.
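
        To make that concrete, here is a hypothetical flattened OCR output (the book and wording are invented for illustration):

        ```python
        # One flat string is all the text-only model sees; the font-size and
        # position cues from the cover are gone:
        ocr_text = "URBAN ECOLOGY Patterns and Processes Jane Smith University Press 2021"
        # From the text alone, "Patterns and Processes" could be the subtitle or
        # part of the title; on the cover, its smaller type below the main title
        # settles the question at a glance.
        ```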
|
|
|
|
|
        **Interesting finding**: Qwen3-VL-8B scores 94% even when run on extracted text only, suggesting strong general text understanding. But it still does better (98%) when given the actual cover images.
        """
    )
    return
|
|
|
|
|
|
|
|
@app.cell
def _(mo):
    mo.md(
        r"""
        ## The Dataset

        We use the [DOAB Metadata Extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction) dataset: academic book covers from the Directory of Open Access Books.

        Each sample has:

        - A cover image (rendered from the PDF)
        - Pre-extracted page text
        - Ground-truth metadata (title, subtitle, publisher, year, ISBN)
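
        You can pull the dataset straight from the Hub to explore it yourself; a minimal sketch using the `datasets` library (see the embedded viewer below for the exact schema):

        ```python
        from datasets import load_dataset

        # Load the DOAB covers dataset from the Hugging Face Hub
        ds = load_dataset("biglam/doab-metadata-extraction", split="train")

        print(ds)     # features/schema
        print(ds[0])  # one sample: cover image, extracted text, ground-truth metadata
        ```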
|
|
""" |
|
|
) |
|
|
return |
|
|
|
|
|
|
|
|
@app.cell
def _(mo):
    mo.Html(
        """
        <iframe
            src="https://huggingface.co/datasets/biglam/doab-metadata-extraction/embed/viewer/default/train"
            frameborder="0"
            width="100%"
            height="400px"
        ></iframe>
        """
    )
    return
|
|
|
|
|
|
|
|
@app.cell
def _(mo):
    mo.md(
        r"""
        ## Methodology

        **Evaluation Framework**: [Inspect AI](https://inspect.aisi.org.uk/), an open-source framework for evaluating language models

        **Sample Size**: 50 books (randomly sampled with a fixed seed for reproducibility)

        **Scoring Methods**:

        - *Title Extraction*: custom flexible-matching scorer (see the sketch after this list)
            - Case-insensitive comparison
            - Accepts a prediction if the ground-truth title is a substring of it (handles subtitles)
            - More robust than exact match for this task
        - *Full Metadata*: LLM-as-judge with partial credit
            - Correct (1.0): title + year + at least one other field
            - Partial (0.5): some fields correct
            - Incorrect (0.0): mostly wrong
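
        A rough sketch of that flexible matching rule (the real scorer is implemented as a custom Inspect AI scorer; the whitespace normalization here is an assumption):

        ```python
        def flexible_title_match(prediction: str, target: str) -> bool:
            """Case-insensitive match that tolerates extra text such as subtitles."""
            pred = " ".join(prediction.lower().split())  # normalize case and whitespace
            truth = " ".join(target.lower().split())
            # Accept when the ground-truth title appears inside the prediction
            return truth in pred

        # A longer prediction that contains the true title still counts as correct
        assert flexible_title_match("Urban Ecology: Patterns and Processes", "urban ecology")
        ```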
|
|
|
|
|
        **Models via**: [Hugging Face Inference Providers](https://huggingface.co/docs/inference-providers)

        ---

        ## Replicate This

        The evaluation logs are stored on the Hugging Face Hub and can be loaded directly:

        ```python
        from inspect_ai.analysis import evals_df

        df = evals_df("hf://datasets/davanstrien/doab-title-extraction-evals")
        ```
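
        From there you can recompute the headline numbers, for example using the same columns this notebook relies on:

        ```python
        df["accuracy"] = df["score_headline_value"] * 100
        print(df.groupby("task_name")["accuracy"].mean())
        ```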
|
|
|
|
|
        ---

        *Built with [Marimo](https://marimo.io) | Evaluation framework: [Inspect AI](https://inspect.aisi.org.uk/) | Dataset: [biglam/doab-metadata-extraction](https://huggingface.co/datasets/biglam/doab-metadata-extraction)*
        """
    )
    return
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    app.run()
|
|
|