Files
llm-benchmark/benchmark_v2.py
2026-04-06 17:44:30 +02:00

642 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
LLM Benchmark v2 — Qwen3.5 9B/27B vs GPT-OSS 20B/120B

Blocks: Code (A1–A3) · Business German (B1–B3)
Metrics: TTFT · thinking time · tok/s · total time

Usage (positional argument is the NUMBER of models to test):
    python benchmark_v2.py 4
    python benchmark_v2.py 2 --backend ollama
    python benchmark_v2.py 1 --results-dir /tmp/bench
"""
import argparse
import asyncio
import json
import re
import sys
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional

import httpx
from rich.console import Console
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    SpinnerColumn,
    TaskID,
    TextColumn,
    TimeElapsedColumn,
)
from rich import box
from rich.table import Table

console = Console()

# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────
# Preset base URLs for the supported OpenAI-compatible backends.
BACKENDS: dict[str, str] = {
    "vllm": "http://localhost:8000/v1",
    "ollama": "http://localhost:11434/v1",
    "lmstudio": "http://localhost:1234/v1",
}
BASE_URL = BACKENDS["vllm"]  # overridden in main() via --backend / --url
DEFAULT_TIMEOUT = 300.0  # per-request streaming timeout, seconds
MAX_RETRIES = 3  # attempts per prompt before recording a failure
# Benchmark prompts. Keys are prompt ids; each entry carries the thematic
# block name, a short human-readable label, and the German prompt text that
# is sent verbatim to the model.
PROMPTS: dict[str, dict] = {
    "A1": {
        "block": "Code",
        "label": "Sortierfunktion mit fehlenden Schlüsseln",
        "text": (
            "Schreibe eine Python-Funktion, die eine Liste von Wörterbüchern nach einem beliebigen "
            "Schlüssel sortiert aufsteigend und absteigend und dabei fehlende Schlüssel "
            "graceful behandelt. Füge Typ-Annotationen und einen kurzen Docstring auf Deutsch hinzu."
        ),
    },
    "A2": {
        "block": "Code",
        "label": "CSV-Debugging",
        "text": (
            "Der folgende Python-Code soll eine CSV-Datei einlesen und den Durchschnitt einer "
            "Spalte berechnen, hat aber mehrere Fehler. Finde und erkläre alle Fehler auf Deutsch, "
            "dann liefere den korrigierten Code:\n\n"
            "import csv\n"
            "def berechne_durchschnitt(datei, spalte):\n"
            " werte = []\n"
            " with open(datei) as f:\n"
            " reader = csv.reader(f)\n"
            " for zeile in reader:\n"
            " werte.append(zeile[spalte])\n"
            " return sum(werte) / len(werte)"
        ),
    },
    "A3": {
        "block": "Code",
        "label": "HTTP-API-Client",
        "text": (
            "Schreibe eine Python-Klasse für einen einfachen HTTP-API-Client mit:\n"
            "- GET und POST Methoden\n"
            "- automatischem Retry bei 429 und 5xx Fehlern (max. 3 Versuche, exponential backoff)\n"
            "- Logging auf Deutsch\n"
            "- Typ-Annotationen\n"
            "Nutze nur die Standardbibliothek + requests."
        ),
    },
    "B1": {
        "block": "Business",
        "label": "MoE-Erklärung für Geschäftskunden",
        "text": (
            # "3–4" restored: a lost en dash had turned the intended
            # "3–4 Sätzen" (3-4 sentences) into the nonsensical "34 Sätzen".
            'Erkläre einem nicht-technischen Geschäftskunden in 3–4 Sätzen, was "Mixture of '
            "Experts\" bei KI-Modellen bedeutet und warum das für ihn als Anwender relevant sein "
            "könnte."
        ),
    },
    "B2": {
        "block": "Business",
        "label": "E-Mail-Absage",
        "text": (
            "Formuliere eine professionelle E-Mail-Absage (ca. 80 Wörter) an einen Dienstleister, "
            "der ein zu teures Angebot für eine KI-Implementierung eingereicht hat. "
            "Ton: höflich, klar, Tür offen lassen für die Zukunft."
        ),
    },
    "B3": {
        "block": "Business",
        "label": "revDSG-Argumente",
        "text": (
            "Nenne drei konkrete Argumente, warum ein Schweizer KMU seine Kundendaten NICHT in "
            "eine US-amerikanische Cloud-KI-Lösung geben sollte aus Sicht des revDSG. "
            "Antworte prägnant und fachlich korrekt."
        ),
    },
}
# ─────────────────────────────────────────────────────────────────────────────
# Datenstrukturen
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RunResult:
    """Metrics and outputs of one prompt run against one model."""

    prompt_id: str  # key into PROMPTS (e.g. "A1")
    model: str  # model name as sent to the server
    ttft_s: float  # time to first streamed token, seconds
    thinking_time_s: float  # duration of the <think>…</think> block; 0.0 if none
    total_time_s: float  # wall-clock time for the whole request
    total_tokens: int  # completion tokens (server usage, or chunk-count fallback)
    tokens_per_sec: float  # total_tokens divided by post-first-token time
    raw_response: str  # full streamed text, including <think> blocks
    visible_response: str  # response with <think>…</think> blocks stripped
    error: Optional[str] = None  # last exception message if all retries failed
# ─────────────────────────────────────────────────────────────────────────────
# Stream-Prozessor mit <think>-Erkennung
# ─────────────────────────────────────────────────────────────────────────────
class StreamProcessor:
    """Processes streaming output in real time and detects <think> blocks.

    Chunks arrive with arbitrary boundaries, so tags may be split across
    chunks; a small state machine (init -> in_think -> visible) accumulates
    text in a buffer and records timestamps for the first token, the first
    visible token, and the start/end of the <think> block.
    """

    _OPEN = "<think>"
    _CLOSE = "</think>"

    def __init__(self) -> None:
        self._chunks: list[str] = []  # every raw chunk, in arrival order
        self._buffer: str = ""  # unconsumed text awaiting tag detection
        self._state: str = "init"  # init | in_think | visible
        self.first_token_time: Optional[float] = None
        self.first_visible_time: Optional[float] = None
        self.think_start_time: Optional[float] = None
        self.think_end_time: Optional[float] = None

    def feed(self, chunk: str, ts: float) -> None:
        """Record one streamed chunk received at perf-counter time *ts*."""
        if not chunk:
            return
        self._chunks.append(chunk)
        if self.first_token_time is None:
            self.first_token_time = ts
        self._buffer += chunk
        self._advance(ts)

    def _advance(self, ts: float) -> None:
        """State machine: detects <think> and </think> boundaries in the buffer."""
        if self._state == "init":
            if self._OPEN in self._buffer:
                # Any text before the opening tag counts as visible output.
                pre = self._buffer[: self._buffer.index(self._OPEN)]
                if pre.strip() and self.first_visible_time is None:
                    self.first_visible_time = self.first_token_time
                self.think_start_time = self.first_token_time
                self._buffer = self._buffer[
                    self._buffer.index(self._OPEN) + len(self._OPEN) :
                ]
                self._state = "in_think"
            elif len(self._buffer) > len(self._OPEN) + 3:
                # Buffer is already longer than a possible partial <think>
                # prefix -> no tag is coming; output is directly visible.
                self._state = "visible"
                if self.first_visible_time is None:
                    self.first_visible_time = self.first_token_time
        if self._state == "in_think":
            if self._CLOSE in self._buffer:
                self.think_end_time = ts
                rest = self._buffer[
                    self._buffer.index(self._CLOSE) + len(self._CLOSE) :
                ]
                self._buffer = rest
                self._state = "visible"
                if rest.strip() and self.first_visible_time is None:
                    self.first_visible_time = ts
        if self._state == "visible":
            if self.first_visible_time is None and self._buffer.strip():
                self.first_visible_time = ts

    @property
    def full_response(self) -> str:
        """The complete raw response, including any <think> blocks."""
        return "".join(self._chunks)

    @property
    def visible_response(self) -> str:
        """The complete response without <think>…</think> blocks."""
        return re.sub(
            r"<think>.*?</think>", "", self.full_response, flags=re.DOTALL
        ).strip()

    @property
    def thinking_time(self) -> float:
        """Seconds spent inside the <think> block; 0.0 if it never closed."""
        if self.think_start_time and self.think_end_time:
            return self.think_end_time - self.think_start_time
        return 0.0
# ─────────────────────────────────────────────────────────────────────────────
# Inference (Streaming + Retry)
# ─────────────────────────────────────────────────────────────────────────────
async def infer(
    client: httpx.AsyncClient,
    model: str,
    prompt_id: str,
    prompt_text: str,
) -> RunResult:
    """Send one prompt to the model and measure all metrics via streaming.

    Retries up to MAX_RETRIES times with exponential backoff. On total
    failure a RunResult with zeroed metrics and the last error message is
    returned instead of raising, so one bad prompt cannot abort the run.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt_text}],
        "stream": True,
        # Ask the server to append a usage record to the stream.
        "stream_options": {"include_usage": True},
    }
    last_exc: Optional[Exception] = None
    for attempt in range(MAX_RETRIES):
        proc = StreamProcessor()
        total_tokens = 0
        start_time = time.perf_counter()
        try:
            async with client.stream(
                "POST",
                f"{BASE_URL}/chat/completions",
                json=payload,
                timeout=DEFAULT_TIMEOUT,
            ) as resp:
                resp.raise_for_status()
                async for line in resp.aiter_lines():
                    # SSE frames: payload lines are prefixed with "data: ".
                    if not line.startswith("data: "):
                        continue
                    data = line[6:]
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data)
                    except json.JSONDecodeError:
                        continue
                    ts = time.perf_counter()
                    if usage := chunk.get("usage"):
                        total_tokens = usage.get("completion_tokens", 0)
                    for choice in chunk.get("choices", []):
                        delta_content = (choice.get("delta") or {}).get("content") or ""
                        if delta_content:
                            proc.feed(delta_content, ts)
            end_time = time.perf_counter()
            ttft = (proc.first_token_time - start_time) if proc.first_token_time else 0.0
            # Fallback: approximate with chunk count when usage is missing.
            if total_tokens == 0:
                total_tokens = len(proc._chunks)
            # Generation time excludes the wait for the first token.
            gen_time = end_time - (proc.first_token_time or start_time)
            tps = total_tokens / gen_time if gen_time > 0 else 0.0
            return RunResult(
                prompt_id=prompt_id,
                model=model,
                ttft_s=round(ttft, 3),
                thinking_time_s=round(proc.thinking_time, 3),
                total_time_s=round(end_time - start_time, 3),
                total_tokens=total_tokens,
                tokens_per_sec=round(tps, 2),
                raw_response=proc.full_response,
                visible_response=proc.visible_response,
            )
        except httpx.HTTPError as exc:
            # Broadened from (Timeout, Connect, HTTPStatus): httpx.HTTPError is
            # their common base and also covers mid-stream failures such as
            # ReadError/RemoteProtocolError, which previously escaped the
            # retry loop and crashed the whole benchmark.
            last_exc = exc
            if attempt < MAX_RETRIES - 1:
                wait = 2**attempt  # exponential backoff: 1s, 2s, …
                console.print(
                    f" [yellow]⚠ Versuch {attempt + 1}/{MAX_RETRIES} fehlgeschlagen "
                    f"({exc.__class__.__name__}), warte {wait}s …[/yellow]"
                )
                await asyncio.sleep(wait)
    # All attempts failed: return a zeroed result carrying the error message.
    return RunResult(
        prompt_id=prompt_id,
        model=model,
        ttft_s=0.0,
        thinking_time_s=0.0,
        total_time_s=0.0,
        total_tokens=0,
        tokens_per_sec=0.0,
        raw_response="",
        visible_response="",
        error=str(last_exc),
    )
# ─────────────────────────────────────────────────────────────────────────────
# Modell-Benchmark
# ─────────────────────────────────────────────────────────────────────────────
async def benchmark_model(
    model: str,
    results_dir: Path,
    progress: Progress,
    task_id: TaskID,
) -> list[RunResult]:
    """Run every prompt against one model and save raw outputs.

    Writes each raw response to <results_dir>/<model>/<prompt_id>.txt and a
    per-model JSON summary (metrics only, no response text) to
    <results_dir>/<model>.json. Returns the collected RunResults, or an
    empty list when the endpoint is unreachable.
    """
    # Sanitize the model name so it is safe as a directory/file name.
    safe_name = re.sub(r"[^\w\-.]", "_", model.split("/")[-1])
    model_dir = results_dir / safe_name
    model_dir.mkdir(parents=True, exist_ok=True)
    results: list[RunResult] = []
    async with httpx.AsyncClient() as client:
        # Check reachability up front so a dead server fails fast.
        try:
            r = await client.get(f"{BASE_URL}/models", timeout=5.0)
            r.raise_for_status()
        except Exception as exc:
            console.print(f" [red]✗ Endpunkt nicht erreichbar: {exc}[/red]")
            return []
        for prompt_id, prompt_data in PROMPTS.items():
            progress.update(
                task_id,
                description=(
                    f"[cyan]{safe_name[:25]}[/cyan] — "
                    f"[bold]{prompt_id}[/bold] {prompt_data['label'][:35]}"
                ),
            )
            result = await infer(client, model, prompt_id, prompt_data["text"])
            results.append(result)
            # Persist the raw response (or an error marker) per prompt.
            (model_dir / f"{prompt_id}.txt").write_text(
                result.raw_response or f"[FEHLER: {result.error}]",
                encoding="utf-8",
            )
            # One-line console status per prompt.
            if result.error:
                console.print(f" {prompt_id} [red]✗ {result.error[:60]}[/red]")
            else:
                think_str = (
                    f" Thinking=[cyan]{result.thinking_time_s:.1f}s[/cyan]"
                    if result.thinking_time_s > 0
                    else ""
                )
                console.print(
                    f" {prompt_id} [green]✓[/green] "
                    f"TTFT=[cyan]{result.ttft_s:.2f}s[/cyan]{think_str} "
                    f"[cyan]{result.tokens_per_sec:.1f}[/cyan] tok/s "
                    f"Gesamt=[cyan]{result.total_time_s:.1f}s[/cyan] "
                    f"Tokens=[cyan]{result.total_tokens}[/cyan]"
                )
            progress.advance(task_id)
    # JSON export (raw responses already live in the .txt files).
    json_path = results_dir / f"{safe_name}.json"
    export = {
        "model": model,
        "timestamp": datetime.now().isoformat(),
        "base_url": BASE_URL,
        "results": [
            {k: v for k, v in asdict(r).items() if k not in ("raw_response", "visible_response")}
            for r in results
        ],
    }
    json_path.write_text(
        json.dumps(export, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    console.print(f" [dim]→ JSON: {json_path}[/dim]")
    console.print(f" [dim]→ Rohantworten: {model_dir}/[/dim]")
    return results
# ─────────────────────────────────────────────────────────────────────────────
# Markdown-Ausgabe
# ─────────────────────────────────────────────────────────────────────────────
def build_markdown(all_results: dict[str, list[RunResult]]) -> str:
    """Render all benchmark results as a Markdown report.

    Produces a summary table (per-model averages over successful runs)
    followed by one detail table per prompt.
    """

    def _avg(rows: list[RunResult], key: str) -> float:
        # Mean of one metric attribute over the given (non-error) runs.
        # Named helper replaces the former PEP8-discouraged lambda assignment.
        return sum(getattr(r, key) for r in rows) / len(rows)

    ts = datetime.now().strftime("%Y-%m-%d %H:%M")
    lines: list[str] = [
        "# LLM Benchmark v2 — Ergebnisse\n",
        f"**Datum:** {ts} | **Server:** {BASE_URL}\n",
    ]
    # ── Summary ─────────────────────────────────────────
    lines.append("## Zusammenfassung (Ø über alle Prompts)\n")
    lines.append("| Modell | TTFT (s) | Thinking (s) | tok/s | Gesamt (s) |")
    lines.append("|--------|----------|--------------|-------|------------|")
    for model, results in all_results.items():
        ok = [r for r in results if not r.error]
        if not ok:
            lines.append(f"| `{model}` | — | — | — | — |")
            continue
        lines.append(
            f"| `{model}` "
            f"| {_avg(ok, 'ttft_s'):.2f} "
            f"| {_avg(ok, 'thinking_time_s'):.1f} "
            f"| {_avg(ok, 'tokens_per_sec'):.1f} "
            f"| {_avg(ok, 'total_time_s'):.1f} |"
        )
    lines.append("")
    # ── Details per prompt ──────────────────────────────
    lines.append("## Details pro Prompt\n")
    for prompt_id, meta in PROMPTS.items():
        # " — " separator restored: the heading previously ran id and label
        # together (e.g. "### A1Sortierfunktion…") after a dash was lost.
        lines.append(f"### {prompt_id} — {meta['label']} `[{meta['block']}]`\n")
        lines.append("| Modell | TTFT (s) | Thinking (s) | tok/s | Tokens | Gesamt (s) |")
        lines.append("|--------|----------|--------------|-------|--------|------------|")
        for model, results in all_results.items():
            r = next((x for x in results if x.prompt_id == prompt_id), None)
            if r is None or r.error:
                err = (r.error or "")[:50] if r else ""
                lines.append(f"| `{model}` | ✗ | ✗ | ✗ | ✗ | {err} |")
                continue
            lines.append(
                f"| `{model}` "
                f"| {r.ttft_s:.2f} "
                f"| {r.thinking_time_s:.1f} "
                f"| {r.tokens_per_sec:.1f} "
                f"| {r.total_tokens} "
                f"| {r.total_time_s:.1f} |"
            )
        lines.append("")
    return "\n".join(lines)
def print_rich_summary(all_results: dict[str, list[RunResult]]) -> None:
    """Render the overall benchmark overview as a Rich table on the console."""
    table = Table(
        title="\n[bold]Benchmark v2 — Zusammenfassung[/bold]",
        box=box.ROUNDED,
        header_style="bold magenta",
        show_lines=True,
    )
    table.add_column("Prompt", style="bold white", width=7, no_wrap=True)
    table.add_column("Block", width=9)
    # One right-aligned column per model, header shortened to 18 chars.
    for model_name in all_results:
        table.add_column(model_name.split("/")[-1][:18], justify="right", width=22)

    def _cell(run: Optional[RunResult]) -> str:
        # Format one (prompt, model) cell: metrics, or a failure marker.
        if run is None or run.error:
            return "[red]✗[/red]"
        think_note = ""
        if run.thinking_time_s > 0:
            think_note = f"\n[dim]think={run.thinking_time_s:.1f}s[/dim]"
        return (
            f"TTFT [cyan]{run.ttft_s:.2f}s[/cyan]\n"
            f"[cyan]{run.tokens_per_sec:.1f}[/cyan] tok/s{think_note}"
        )

    for prompt_id, meta in PROMPTS.items():
        per_model = [
            _cell(next((x for x in results if x.prompt_id == prompt_id), None))
            for results in all_results.values()
        ]
        table.add_row(prompt_id, meta["block"], *per_model)
    console.print(table)
async def detect_model() -> Optional[str]:
    """Ask the configured OpenAI-compatible endpoint for its loaded model.

    Returns the id of the first model listed by GET /models, or None when
    the server is unreachable or reports no models.
    """
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(f"{BASE_URL}/models", timeout=5.0)
            resp.raise_for_status()
            entries = resp.json().get("data", [])
            if not entries:
                return None
            return entries[0]["id"]
    except Exception as exc:  # boundary: report and let the caller abort
        console.print(f"[red]✗ Server nicht erreichbar: {exc}[/red]")
        return None
# ─────────────────────────────────────────────────────────────────────────────
# Einstiegspunkt
# ─────────────────────────────────────────────────────────────────────────────
async def main() -> None:
    """CLI entry point: parse arguments, benchmark each model, write reports."""
    global BASE_URL
    parser = argparse.ArgumentParser(
        description="LLM Benchmark v2 — Qwen3.5 9B/27B vs GPT-OSS 20B/120B",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Beispiele:\n"
            " python benchmark_v2.py 4\n"
            " python benchmark_v2.py 2 --backend ollama\n"
            " python benchmark_v2.py 1 --backend lmstudio\n"
            " python benchmark_v2.py 2 --url http://localhost:9000/v1\n"
        ),
    )
    parser.add_argument(
        "count", type=int, metavar="ANZAHL",
        help="Anzahl Modelle die getestet werden (z.B. 4)",
    )
    parser.add_argument(
        "--backend", choices=list(BACKENDS.keys()), default="vllm",
        help=f"Backend-Preset: {', '.join(f'{k}={v}' for k, v in BACKENDS.items())}",
    )
    parser.add_argument(
        "--url", default=None, metavar="URL",
        help="Eigene Base-URL (überschreibt --backend), z.B. http://localhost:9000/v1",
    )
    parser.add_argument(
        "--model", default=None, metavar="MODELL",
        help="Modellname explizit angeben (überspringt Auto-Detect), z.B. gemma4:31b",
    )
    parser.add_argument(
        "--results-dir", default="results", metavar="DIR",
        help="Ausgabeverzeichnis (Standard: results/)",
    )
    args = parser.parse_args()
    # --url wins over the --backend preset.
    BASE_URL = args.url if args.url else BACKENDS[args.backend]
    results_dir = Path(args.results_dir)
    # parents=True added: a nested --results-dir (e.g. /tmp/bench/run1) used
    # to raise FileNotFoundError; benchmark_model() already uses parents=True.
    results_dir.mkdir(parents=True, exist_ok=True)
    backend_label = args.url if args.url else args.backend
    console.rule("[bold magenta]LLM Benchmark v2[/bold magenta]")
    model_info = f"[cyan]{args.model}[/cyan] (fest)" if args.model else "[cyan]auto-detect[/cyan]"
    console.print(
        f"Backend: [cyan]{backend_label}[/cyan] → {BASE_URL}\n"
        f"Modell: {model_info}\n"
        f"Modelle: [cyan]{args.count}x[/cyan]\n"
        f"Prompts: [cyan]{len(PROMPTS)}[/cyan] "
        # En dashes restored ("A1A3" / "B1B3" were garbled ranges).
        f"(A1–A3 Code · B1–B3 Business)\n"
        f"Output: [cyan]{results_dir}/[/cyan]\n"
    )
    all_results: dict[str, list[RunResult]] = {}
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
        transient=False,
    ) as progress:
        overall = progress.add_task("[bold]Gesamt[/bold]", total=args.count * len(PROMPTS))
        for idx in range(args.count):
            # Before every model after the first: pause the progress display
            # so the operator can swap the served model, then wait for Enter.
            if idx > 0:
                progress.stop()
                console.print()
                console.rule("[bold yellow]Modellwechsel[/bold yellow]")
                console.print(
                    f"\n Modell {idx + 1}/{args.count}\n\n"
                    f" 1. Starte vllm neu: [dim]~/scripts/vllm/start_model.sh[/dim]\n"
                    f" 2. Wähle das nächste Modell\n"
                    f" 3. Warte bis der Server bereit ist\n"
                )
                console.print(" Dann hier [bold]Enter[/bold] drücken …")
                input()
                progress.start()
            # Model name: explicit via --model, otherwise queried from server.
            model = args.model or await detect_model()
            if not model:
                # "Abbruch" capitalized (German noun) in the abort message.
                console.print(f"[red]✗ Kein Modell auf {BASE_URL} gefunden — Abbruch.[/red]")
                break
            console.rule(f"[bold cyan]{model}[/bold cyan]")
            task = progress.add_task("", total=len(PROMPTS))
            results = await benchmark_model(
                model, results_dir, progress=progress, task_id=task
            )
            all_results[model] = results
            progress.advance(overall, advance=len(PROMPTS))
    # Rich summary table in the terminal.
    console.print()
    print_rich_summary(all_results)
    # Persist the Markdown report.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    md_path = results_dir / f"benchmark_v2_{ts}.md"
    md_path.write_text(build_markdown(all_results), encoding="utf-8")
    console.print()
    console.rule("[bold green]Fertig[/bold green]")
    console.print(f"[green]✓[/green] Markdown: [bold]{md_path}[/bold]")
    # Message corrected: per-model JSON lives directly in results_dir; only
    # the raw .txt responses are in the per-model subdirectory.
    console.print(
        f"[green]✓[/green] JSON: [bold]{results_dir}/<modell>.json[/bold] · "
        f".txt: [bold]{results_dir}/<modell>/[/bold]"
    )
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl-C: exit cleanly instead of dumping an asyncio traceback.
        console.print("\n[yellow]Abgebrochen.[/yellow]")
        sys.exit(0)