#!/usr/bin/env python3
"""
LLM Benchmark v2 — Qwen3.5 9B/27B vs GPT-OSS 20B/120B

Blocks:  Code (A1–A3) · Business German (B1–B3)
Metrics: TTFT · thinking time · tok/s · total time

Usage (the positional argument is the NUMBER of models to benchmark;
the concrete model name is auto-detected from the server or given via --model):
    python benchmark_v2.py 4
    python benchmark_v2.py 2 --backend ollama
    python benchmark_v2.py 1 --results-dir /tmp/bench
"""

import argparse
import asyncio
import json
import re
import sys
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Optional

import httpx
from rich.console import Console
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    SpinnerColumn,
    TaskID,
    TextColumn,
    TimeElapsedColumn,
)
from rich import box
from rich.table import Table

console = Console()

# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────

# OpenAI-compatible endpoints for the supported local backends.
BACKENDS: dict[str, str] = {
    "vllm": "http://localhost:8000/v1",
    "ollama": "http://localhost:11434/v1",
    "lmstudio": "http://localhost:1234/v1",
}

BASE_URL = BACKENDS["vllm"]  # overridden in main() via --backend / --url
DEFAULT_TIMEOUT = 300.0  # per-request streaming timeout in seconds
MAX_RETRIES = 3  # attempts per prompt on network/HTTP errors

# Benchmark prompts. The prompt texts are deliberately German (that is what is
# being benchmarked) — do not translate them.
PROMPTS: dict[str, dict] = {
    "A1": {
        "block": "Code",
        "label": "Sortierfunktion mit fehlenden Schlüsseln",
        "text": (
            "Schreibe eine Python-Funktion, die eine Liste von Wörterbüchern nach einem beliebigen "
            "Schlüssel sortiert – aufsteigend und absteigend – und dabei fehlende Schlüssel "
            "graceful behandelt. Füge Typ-Annotationen und einen kurzen Docstring auf Deutsch hinzu."
        ),
    },
    "A2": {
        "block": "Code",
        "label": "CSV-Debugging",
        "text": (
            "Der folgende Python-Code soll eine CSV-Datei einlesen und den Durchschnitt einer "
            "Spalte berechnen, hat aber mehrere Fehler. Finde und erkläre alle Fehler auf Deutsch, "
            "dann liefere den korrigierten Code:\n\n"
            "import csv\n"
            "def berechne_durchschnitt(datei, spalte):\n"
            "    werte = []\n"
            "    with open(datei) as f:\n"
            "        reader = csv.reader(f)\n"
            "        for zeile in reader:\n"
            "            werte.append(zeile[spalte])\n"
            "    return sum(werte) / len(werte)"
        ),
    },
    "A3": {
        "block": "Code",
        "label": "HTTP-API-Client",
        "text": (
            "Schreibe eine Python-Klasse für einen einfachen HTTP-API-Client mit:\n"
            "- GET und POST Methoden\n"
            "- automatischem Retry bei 429 und 5xx Fehlern (max. 3 Versuche, exponential backoff)\n"
            "- Logging auf Deutsch\n"
            "- Typ-Annotationen\n"
            "Nutze nur die Standardbibliothek + requests."
        ),
    },
    "B1": {
        "block": "Business",
        "label": "MoE-Erklärung für Geschäftskunden",
        "text": (
            'Erkläre einem nicht-technischen Geschäftskunden in 3–4 Sätzen, was "Mixture of '
            "Experts\" bei KI-Modellen bedeutet und warum das für ihn als Anwender relevant sein "
            "könnte."
        ),
    },
    "B2": {
        "block": "Business",
        "label": "E-Mail-Absage",
        "text": (
            "Formuliere eine professionelle E-Mail-Absage (ca. 80 Wörter) an einen Dienstleister, "
            "der ein zu teures Angebot für eine KI-Implementierung eingereicht hat. "
            "Ton: höflich, klar, Tür offen lassen für die Zukunft."
        ),
    },
    "B3": {
        "block": "Business",
        "label": "revDSG-Argumente",
        "text": (
            "Nenne drei konkrete Argumente, warum ein Schweizer KMU seine Kundendaten NICHT in "
            "eine US-amerikanische Cloud-KI-Lösung geben sollte – aus Sicht des revDSG. "
            "Antworte prägnant und fachlich korrekt."
        ),
    },
}


# ─────────────────────────────────────────────────────────────────────────────
# Data structures
# ─────────────────────────────────────────────────────────────────────────────


@dataclass
class RunResult:
    """Metrics and output of one (prompt, model) benchmark run."""

    prompt_id: str  # key into PROMPTS, e.g. "A1"
    model: str  # model id as reported by / passed to the server
    ttft_s: float  # time to first streamed token, seconds
    thinking_time_s: float  # duration of the <think>…</think> block, 0 if none
    total_time_s: float  # wall-clock time for the whole request
    total_tokens: int  # completion tokens (usage field, or chunk count fallback)
    tokens_per_sec: float  # total_tokens / generation time
    raw_response: str  # full response including any <think> block
    visible_response: str  # response with <think> blocks stripped
    error: Optional[str] = None  # set when all retries failed


# ─────────────────────────────────────────────────────────────────────────────
# Stream processor with <think> detection
# ─────────────────────────────────────────────────────────────────────────────


class StreamProcessor:
    """Processes streaming output in real time and detects <think> blocks.

    Reasoning models (e.g. Qwen) emit an optional leading ``<think>…</think>``
    block before the visible answer; this class tracks when thinking starts
    and ends so thinking time can be reported separately from TTFT.
    """

    # NOTE(review): these tag literals were empty strings in the received
    # source — almost certainly stripped by markup sanitization. Restored to
    # the <think>/</think> markers the rest of the class (and its docstring)
    # is clearly built around; with empty tags the state machine and the
    # visible_response regex are degenerate.
    _OPEN = "<think>"
    _CLOSE = "</think>"

    def __init__(self) -> None:
        self._chunks: list[str] = []  # every raw delta, in arrival order
        self._buffer: str = ""  # unconsumed text for tag detection
        self._state: str = "init"  # init | in_think | visible
        self.first_token_time: Optional[float] = None
        self.first_visible_time: Optional[float] = None
        self.think_start_time: Optional[float] = None
        self.think_end_time: Optional[float] = None

    def feed(self, chunk: str, ts: float) -> None:
        """Record one streamed delta *chunk* observed at timestamp *ts*."""
        if not chunk:
            return
        self._chunks.append(chunk)
        if self.first_token_time is None:
            self.first_token_time = ts
        self._buffer += chunk
        self._advance(ts)

    def _advance(self, ts: float) -> None:
        """State machine: detect <think> boundaries in the buffer."""
        if self._state == "init":
            if self._OPEN in self._buffer:
                pre = self._buffer[: self._buffer.index(self._OPEN)]
                if pre.strip() and self.first_visible_time is None:
                    self.first_visible_time = self.first_token_time
                self.think_start_time = self.first_token_time
                self._buffer = self._buffer[
                    self._buffer.index(self._OPEN) + len(self._OPEN) :
                ]
                self._state = "in_think"
            elif len(self._buffer) > len(self._OPEN) + 3:
                # Enough text seen without an opening tag → answer is
                # directly visible, no thinking phase.
                self._state = "visible"
                if self.first_visible_time is None:
                    self.first_visible_time = self.first_token_time

        if self._state == "in_think":
            if self._CLOSE in self._buffer:
                self.think_end_time = ts
                rest = self._buffer[
                    self._buffer.index(self._CLOSE) + len(self._CLOSE) :
                ]
                self._buffer = rest
                self._state = "visible"
                if rest.strip() and self.first_visible_time is None:
                    self.first_visible_time = ts

        if self._state == "visible":
            if self.first_visible_time is None and self._buffer.strip():
                self.first_visible_time = ts

    @property
    def full_response(self) -> str:
        """Complete raw response, including any <think> block."""
        return "".join(self._chunks)

    @property
    def visible_response(self) -> str:
        """Complete response with <think> blocks removed."""
        return re.sub(
            r"<think>.*?</think>", "", self.full_response, flags=re.DOTALL
        ).strip()

    @property
    def thinking_time(self) -> float:
        """Seconds spent inside the <think> block; 0.0 if none was seen."""
        if self.think_start_time and self.think_end_time:
            return self.think_end_time - self.think_start_time
        return 0.0


# ─────────────────────────────────────────────────────────────────────────────
# Inference (streaming + retry)
# ─────────────────────────────────────────────────────────────────────────────


async def infer(
    client: httpx.AsyncClient,
    model: str,
    prompt_id: str,
    prompt_text: str,
) -> RunResult:
    """Send one prompt to the model and measure all metrics via streaming.

    Retries up to MAX_RETRIES times with exponential backoff on timeouts,
    connection errors and HTTP error statuses; on final failure returns a
    RunResult with ``error`` set instead of raising.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt_text}],
        "stream": True,
        "stream_options": {"include_usage": True},
    }

    last_exc: Optional[Exception] = None

    for attempt in range(MAX_RETRIES):
        proc = StreamProcessor()
        total_tokens = 0
        start_time = time.perf_counter()

        try:
            async with client.stream(
                "POST",
                f"{BASE_URL}/chat/completions",
                json=payload,
                timeout=DEFAULT_TIMEOUT,
            ) as resp:
                resp.raise_for_status()
                async for line in resp.aiter_lines():
                    # SSE framing: payload lines start with "data: ".
                    if not line.startswith("data: "):
                        continue
                    data = line[6:]
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data)
                    except json.JSONDecodeError:
                        continue

                    ts = time.perf_counter()

                    # Final usage chunk carries the authoritative token count.
                    if usage := chunk.get("usage"):
                        total_tokens = usage.get("completion_tokens", 0)

                    for choice in chunk.get("choices", []):
                        delta_content = (choice.get("delta") or {}).get("content") or ""
                        if delta_content:
                            proc.feed(delta_content, ts)

            end_time = time.perf_counter()
            ttft = (proc.first_token_time - start_time) if proc.first_token_time else 0.0

            # Fallback: count chunks when the server sent no usage data.
            if total_tokens == 0:
                total_tokens = len(proc._chunks)

            # tok/s over generation time only (excludes time before first token).
            gen_time = end_time - (proc.first_token_time or start_time)
            tps = total_tokens / gen_time if gen_time > 0 else 0.0

            return RunResult(
                prompt_id=prompt_id,
                model=model,
                ttft_s=round(ttft, 3),
                thinking_time_s=round(proc.thinking_time, 3),
                total_time_s=round(end_time - start_time, 3),
                total_tokens=total_tokens,
                tokens_per_sec=round(tps, 2),
                raw_response=proc.full_response,
                visible_response=proc.visible_response,
            )

        except (httpx.TimeoutException, httpx.ConnectError, httpx.HTTPStatusError) as exc:
            last_exc = exc
            if attempt < MAX_RETRIES - 1:
                wait = 2**attempt  # exponential backoff: 1s, 2s, 4s, …
                console.print(
                    f" [yellow]⚠ Versuch {attempt + 1}/{MAX_RETRIES} fehlgeschlagen "
                    f"({exc.__class__.__name__}), warte {wait}s …[/yellow]"
                )
                await asyncio.sleep(wait)

    # All attempts failed — report the last exception instead of raising.
    return RunResult(
        prompt_id=prompt_id,
        model=model,
        ttft_s=0.0,
        thinking_time_s=0.0,
        total_time_s=0.0,
        total_tokens=0,
        tokens_per_sec=0.0,
        raw_response="",
        visible_response="",
        error=str(last_exc),
    )


# ─────────────────────────────────────────────────────────────────────────────
# Per-model benchmark
# ─────────────────────────────────────────────────────────────────────────────


async def benchmark_model(
    model: str,
    results_dir: Path,
    progress: Progress,
    task_id: TaskID,
) -> list[RunResult]:
    """Run all prompts against *model*, save raw outputs and a JSON summary.

    Returns an empty list when the endpoint is unreachable.
    """
    # Sanitize the model id so it is usable as a directory/file name.
    safe_name = re.sub(r"[^\w\-.]", "_", model.split("/")[-1])
    model_dir = results_dir / safe_name
    model_dir.mkdir(parents=True, exist_ok=True)

    results: list[RunResult] = []

    async with httpx.AsyncClient() as client:
        # Reachability check before burning time on prompts.
        try:
            r = await client.get(f"{BASE_URL}/models", timeout=5.0)
            r.raise_for_status()
        except Exception as exc:
            console.print(f" [red]✗ Endpunkt nicht erreichbar: {exc}[/red]")
            return []

        for prompt_id, prompt_data in PROMPTS.items():
            progress.update(
                task_id,
                description=(
                    f"[cyan]{safe_name[:25]}[/cyan] — "
                    f"[bold]{prompt_id}[/bold] {prompt_data['label'][:35]}"
                ),
            )

            result = await infer(client, model, prompt_id, prompt_data["text"])
            results.append(result)

            # Persist the raw response (or the error) per prompt.
            (model_dir / f"{prompt_id}.txt").write_text(
                result.raw_response or f"[FEHLER: {result.error}]",
                encoding="utf-8",
            )

            # Console status line for this prompt.
            if result.error:
                console.print(f" {prompt_id} [red]✗ {result.error[:60]}[/red]")
            else:
                think_str = (
                    f" Thinking=[cyan]{result.thinking_time_s:.1f}s[/cyan]"
                    if result.thinking_time_s > 0
                    else ""
                )
                console.print(
                    f" {prompt_id} [green]✓[/green] "
                    f"TTFT=[cyan]{result.ttft_s:.2f}s[/cyan]{think_str} "
                    f"[cyan]{result.tokens_per_sec:.1f}[/cyan] tok/s "
                    f"Gesamt=[cyan]{result.total_time_s:.1f}s[/cyan] "
                    f"Tokens=[cyan]{result.total_tokens}[/cyan]"
                )

            progress.advance(task_id)

    # JSON export (raw responses live in the per-prompt .txt files).
    json_path = results_dir / f"{safe_name}.json"
    export = {
        "model": model,
        "timestamp": datetime.now().isoformat(),
        "base_url": BASE_URL,
        "results": [
            {k: v for k, v in asdict(r).items() if k not in ("raw_response", "visible_response")}
            for r in results
        ],
    }
    json_path.write_text(
        json.dumps(export, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    console.print(f" [dim]→ JSON: {json_path}[/dim]")
    console.print(f" [dim]→ Rohantworten: {model_dir}/[/dim]")

    return results


# ─────────────────────────────────────────────────────────────────────────────
# Markdown output
# ─────────────────────────────────────────────────────────────────────────────


def build_markdown(all_results: dict[str, list[RunResult]]) -> str:
    """Render all benchmark results as a Markdown report.

    Produces a per-model summary table (averages over all non-error runs)
    followed by one table per prompt. Table headings/labels are German on
    purpose — they are part of the report output.
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M")
    lines: list[str] = [
        "# LLM Benchmark v2 — Ergebnisse\n",
        f"**Datum:** {ts} | **Server:** {BASE_URL}\n",
    ]

    # ── Summary (averages over all prompts) ──────────────
    lines.append("## Zusammenfassung (Ø über alle Prompts)\n")
    lines.append("| Modell | TTFT (s) | Thinking (s) | tok/s | Gesamt (s) |")
    lines.append("|--------|----------|--------------|-------|------------|")

    for model, results in all_results.items():
        ok = [r for r in results if not r.error]
        if not ok:
            # Every run for this model failed — dash out the row.
            lines.append(f"| `{model}` | — | — | — | — |")
            continue
        avg = lambda key: sum(getattr(r, key) for r in ok) / len(ok)  # noqa: E731
        lines.append(
            f"| `{model}` "
            f"| {avg('ttft_s'):.2f} "
            f"| {avg('thinking_time_s'):.1f} "
            f"| {avg('tokens_per_sec'):.1f} "
            f"| {avg('total_time_s'):.1f} |"
        )

    lines.append("")

    # ── Details per prompt ───────────────────────────────
    lines.append("## Details pro Prompt\n")

    for prompt_id, meta in PROMPTS.items():
        lines.append(f"### {prompt_id} — {meta['label']} `[{meta['block']}]`\n")
        lines.append("| Modell | TTFT (s) | Thinking (s) | tok/s | Tokens | Gesamt (s) |")
        lines.append("|--------|----------|--------------|-------|--------|------------|")

        for model, results in all_results.items():
            r = next((x for x in results if x.prompt_id == prompt_id), None)
            if r is None or r.error:
                err = (r.error or "—")[:50] if r else "—"
                lines.append(f"| `{model}` | ✗ | ✗ | ✗ | ✗ | {err} |")
                continue
            lines.append(
                f"| `{model}` "
                f"| {r.ttft_s:.2f} "
                f"| {r.thinking_time_s:.1f} "
                f"| {r.tokens_per_sec:.1f} "
                f"| {r.total_tokens} "
                f"| {r.total_time_s:.1f} |"
            )

        lines.append("")

    return "\n".join(lines)


def print_rich_summary(all_results: dict[str, list[RunResult]]) -> None:
    """Print a Rich table with the overall benchmark summary to the console."""
    table = Table(
        title="\n[bold]Benchmark v2 — Zusammenfassung[/bold]",
        box=box.ROUNDED,
        header_style="bold magenta",
        show_lines=True,
    )
    table.add_column("Prompt", style="bold white", width=7, no_wrap=True)
    table.add_column("Block", width=9)

    # One column per model, truncated so the table fits a terminal.
    models = list(all_results.keys())
    for m in models:
        short = m.split("/")[-1][:18]
        table.add_column(short, justify="right", width=22)

    for prompt_id, meta in PROMPTS.items():
        row = [prompt_id, meta["block"]]
        for model, results in all_results.items():
            r = next((x for x in results if x.prompt_id == prompt_id), None)
            if r is None or r.error:
                row.append("[red]✗[/red]")
            else:
                think = (
                    f"\n[dim]think={r.thinking_time_s:.1f}s[/dim]"
                    if r.thinking_time_s > 0
                    else ""
                )
                row.append(
                    f"TTFT [cyan]{r.ttft_s:.2f}s[/cyan]\n"
                    f"[cyan]{r.tokens_per_sec:.1f}[/cyan] tok/s{think}"
                )
        table.add_row(*row)

    console.print(table)


async def detect_model() -> Optional[str]:
    """Ask the server for the id of the currently loaded model.

    Returns the first model id from GET /models, or None (with a console
    message) when the server is unreachable or reports no models.
    """
    try:
        async with httpx.AsyncClient() as client:
            r = await client.get(f"{BASE_URL}/models", timeout=5.0)
            r.raise_for_status()
            models = r.json().get("data", [])
            return models[0]["id"] if models else None
    except Exception as exc:
        console.print(f"[red]✗ Server nicht erreichbar: {exc}[/red]")
        return None


# ─────────────────────────────────────────────────────────────────────────────
# Entry point
# ─────────────────────────────────────────────────────────────────────────────


async def main() -> None:
    """Parse CLI args, run the interactive multi-model benchmark, write reports.

    The user is prompted to swap the served model between iterations; the
    model id is auto-detected from the server unless --model is given.
    Mutates the module-level BASE_URL according to --backend/--url.
    """
    global BASE_URL

    parser = argparse.ArgumentParser(
        description="LLM Benchmark v2 — Qwen3.5 9B/27B vs GPT-OSS 20B/120B",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Beispiele:\n"
            " python benchmark_v2.py 4\n"
            " python benchmark_v2.py 2 --backend ollama\n"
            " python benchmark_v2.py 1 --backend lmstudio\n"
            " python benchmark_v2.py 2 --url http://localhost:9000/v1\n"
        ),
    )
    parser.add_argument(
        "count", type=int, metavar="ANZAHL",
        help="Anzahl Modelle die getestet werden (z.B. 4)",
    )
    parser.add_argument(
        "--backend", choices=list(BACKENDS.keys()), default="vllm",
        help=f"Backend-Preset: {', '.join(f'{k}={v}' for k, v in BACKENDS.items())}",
    )
    parser.add_argument(
        "--url", default=None, metavar="URL",
        help="Eigene Base-URL (überschreibt --backend), z.B. http://localhost:9000/v1",
    )
    parser.add_argument(
        "--model", default=None, metavar="MODELL",
        help="Modellname explizit angeben (überspringt Auto-Detect), z.B. gemma4:31b",
    )
    parser.add_argument(
        "--results-dir", default="results", metavar="DIR",
        help="Ausgabeverzeichnis (Standard: results/)",
    )
    args = parser.parse_args()

    # --url wins over the --backend preset.
    BASE_URL = args.url if args.url else BACKENDS[args.backend]

    # NOTE(review): no parents=True — a nested --results-dir path whose parent
    # does not exist will raise FileNotFoundError here.
    results_dir = Path(args.results_dir)
    results_dir.mkdir(exist_ok=True)

    backend_label = args.url if args.url else args.backend
    console.rule("[bold magenta]LLM Benchmark v2[/bold magenta]")
    model_info = f"[cyan]{args.model}[/cyan] (fest)" if args.model else "[cyan]auto-detect[/cyan]"
    console.print(
        f"Backend: [cyan]{backend_label}[/cyan] → {BASE_URL}\n"
        f"Modell: {model_info}\n"
        f"Modelle: [cyan]{args.count}x[/cyan]\n"
        f"Prompts: [cyan]{len(PROMPTS)}[/cyan] "
        f"(A1–A3 Code · B1–B3 Business)\n"
        f"Output: [cyan]{results_dir}/[/cyan]\n"
    )

    all_results: dict[str, list[RunResult]] = {}

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
        transient=False,
    ) as progress:
        overall = progress.add_task("[bold]Gesamt[/bold]", total=args.count * len(PROMPTS))

        for idx in range(args.count):
            # Before each model after the first: pause the live progress
            # display and wait for the operator to swap the served model.
            if idx > 0:
                progress.stop()
                console.print()
                console.rule("[bold yellow]Modellwechsel[/bold yellow]")
                console.print(
                    f"\n Modell {idx + 1}/{args.count}\n\n"
                    f" 1. Starte vllm neu: [dim]~/scripts/vllm/start_model.sh[/dim]\n"
                    f" 2. Wähle das nächste Modell\n"
                    f" 3. Warte bis der Server bereit ist\n"
                )
                console.print(" Dann hier [bold]Enter[/bold] drücken …")
                input()
                progress.start()

            # Model name: explicit via --model, otherwise asked from the server.
            model = args.model or await detect_model()
            if not model:
                console.print(f"[red]✗ Kein Modell auf {BASE_URL} gefunden — abbruch.[/red]")
                break

            console.rule(f"[bold cyan]{model}[/bold cyan]")
            task = progress.add_task("", total=len(PROMPTS))

            results = await benchmark_model(
                model, results_dir, progress=progress, task_id=task
            )
            all_results[model] = results
            progress.advance(overall, advance=len(PROMPTS))

    # Rich summary table in the terminal.
    console.print()
    print_rich_summary(all_results)

    # Write the Markdown report.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    md_path = results_dir / f"benchmark_v2_{ts}.md"
    md_path.write_text(build_markdown(all_results), encoding="utf-8")

    console.print()
    console.rule("[bold green]Fertig[/bold green]")
    console.print(f"[green]✓[/green] Markdown: [bold]{md_path}[/bold]")
    # NOTE(review): "//" in the message below looks like a typo for "/*/".
    console.print(f"[green]✓[/green] JSON + .txt: [bold]{results_dir}//[/bold]")


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Graceful Ctrl+C: plain message instead of a traceback.
        console.print("\n[yellow]Abgebrochen.[/yellow]")
        sys.exit(0)