diff --git a/docs/rag-web-ui.md b/docs/rag-web-ui.md new file mode 100644 index 00000000..73cafaaf --- /dev/null +++ b/docs/rag-web-ui.md @@ -0,0 +1,78 @@ +# RAG и веб‑UI: архитектура и фазы + +Цель: **не** раздувать `claw-analog` и основной `claw` — вынести индексацию и (позже) UI в отдельные процессы с явными HTTP/MCP контрактами. + +## Принципы + +1. **RAG как сервис** — отдельный бинарь (сейчас `claw-rag-service`), свой жизненный цикл, свои секреты (embedding API), своё хранилище. +2. **Агент только вызывает retrieval** — в **`claw-analog`** инструмент **`retrieve_context`** → HTTP `POST {RAG_BASE_URL}/v1/query` (база без суффикса `/v1`); лимиты **`rag_timeout_secs`**, **`rag_top_k_max`** в `.claw-analog.toml`; ответ для модели — фрагменты с `path` + `snippet` + `score`. +3. **Веб‑UI** — минимальная страница **`GET /`** в `claw-rag-service` (stats + форма `POST /v1/query`); чат с моделью и «переиндексировать» из браузера — при необходимости позже. + +## Компоненты (целевая картина) + +```text +┌─────────────────┐ POST /v1/query ┌──────────────────────┐ +│ claw-analog │ ──────────────────────►│ claw-rag-service │ +│ (+ tool) │◄──────────────────────│ (embed + vector DB) │ +└─────────────────┘ JSON hits └──────────┬───────────┘ + │ + ingest (watch / CLI) + ▼ + workspace files / git tree +``` + +- **Индексация**: отдельная команда или воркер (chunking, хеш файла, инкремент). Хранилище: на старте SQLite + `sqlite-vec` / файловый эмбеддинг-кэш; при росте — Qdrant/Chroma в Docker. +- **Эмбеддинги**: HTTP к OpenAI/Anthropic-совместимому embedding endpoint или локальная модель (отдельное решение по лицензии и размеру). +- **Веб‑UI**: авторизация (минимум: токен + reverse proxy), SSE или WebSocket для стрима ответа модели; UI **не** владеет секретами провайдера, если продукт так решит — прокси через бэкенд. + +## Текущая реализация + +Крейт **`rust/crates/claw-rag-service`** (из каталога `rust/`): + +### HTTP + +- `GET /` — одностраничный UI (встроенный `static/index.html`): счётчики из `/v1/stats`, поиск через `/v1/query`. +- `GET /health` — `ok`. +- `GET /v1/stats` — `{ "chunks": N, "phase": "1-sqlite" }` (если БД ещё нет: `chunks: 0`, `phase`: `1-sqlite-no-db`). +- `POST /v1/query` — тело `{"query":"...", "top_k":8}`; ответ `{"hits":[{"path","snippet","score"}], "phase":"1-sqlite"|"1-sqlite-empty"|"1-sqlite-no-db"}`. + +Поиск: **линейный обход** всех векторов в SQLite (MVP; для больших репозиториев планировать Qdrant/sqlite-vec или батчевый ANN). + +### Индексация (фаза 1) + +```powershell +cd D:\path\to\claw-code-main\rust +$env:OPENAI_API_KEY = "sk-..." +cargo run -p claw-rag-service -- ingest -w D:\path\to\repo --db D:\path\to\index.sqlite +cargo run -p claw-analog -- ... # при RAG_BASE_URL или rag_base_url в TOML — инструмент retrieve_context +``` + +Переменные окружения: + +- **`OPENAI_API_KEY`** или **`CLAW_RAG_OPENAI_API_KEY`** — для вызова `POST …/embeddings`. +- **`CLAW_RAG_EMBEDDING_BASE_URL`** — по умолчанию `https://api.openai.com/v1`. +- **`CLAW_RAG_EMBEDDING_MODEL`** — по умолчанию `text-embedding-3-small`. +- **`CLAW_RAG_DB`** — путь к SQLite (у ingest/`serve`; у `serve` есть default `.claw-rag/index.sqlite`). +- **`CLAW_RAG_PORT`** — порт HTTP (по умолчанию `8787`). +- **`CLAW_RAG_MOCK_PROVIDERS=1`** — детерминированные вектора без сети (для тестов CI). + +Запуск сервера: `cargo run -p claw-rag-service` или `cargo run -p claw-rag-service -- serve --db path\to\index.sqlite`. + +### Дальше по фазам + +| Фаза | Содержание | +|------|------------| +| 1 | ~~Ingest + SQLite + embeddings~~ (базово сделано; улучшения: инкремент, ANN, Docker-векторка). | +| 2 | ~~Инструмент `retrieve_context`~~: `RAG_BASE_URL` / `rag_base_url`, `rag_timeout_secs`, `rag_top_k_max` в `.claw-analog.toml`. | +| 3 | ~~Минимальный UI~~: `GET /` + те же `/v1/*` (дальше: чат, кнопка re-index из UI). | + +## Риски и ограничения + +- Секреты и PII в индексе; размер индекса и стоимость эмбеддингов. +- Согласованность с symlink/jail как в `claw-analog` — retrieval не должен «утекать» за пределы workspace. +- Локаль на UI: i18n отдельно от `AnalogLanguage` в CLI. + +## Связанные документы + +- Локальный запуск контейнеров (если поднимете векторку): [`container.md`](container.md). +- Обзор `claw-analog`: [`how_to_run.md`](../how_to_run.md). diff --git a/rust/crates/claw-rag-service/Cargo.toml b/rust/crates/claw-rag-service/Cargo.toml new file mode 100644 index 00000000..b5098dba --- /dev/null +++ b/rust/crates/claw-rag-service/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "claw-rag-service" +version.workspace = true +edition.workspace = true +license.workspace = true +publish.workspace = true +description = "Workspace RAG service: SQLite index, OpenAI-compatible embeddings, query API." + +[dependencies] +axum = "0.8" +clap = { version = "4", features = ["derive", "env"] } +dotenvy = "0.15" +reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +rusqlite = { version = "0.32", features = ["bundled"] } +serde = { version = "1", features = ["derive"] } +serde_json.workspace = true +tokio = { version = "1", features = ["macros", "net", "rt-multi-thread", "signal"] } +walkdir = "2" +qdrant-client = { version = "1.17", optional = true } +blake3 = "1" + +[dev-dependencies] +tempfile = "3" + +[features] +default = [] +qdrant-index = ["dep:qdrant-client"] + +[lints] +workspace = true diff --git a/rust/crates/claw-rag-service/Dockerfile b/rust/crates/claw-rag-service/Dockerfile new file mode 100644 index 00000000..2027ed4c --- /dev/null +++ b/rust/crates/claw-rag-service/Dockerfile @@ -0,0 +1,20 @@ +# qdrant-client currently requires a fairly recent stable Rust. +# Keep this pinned to avoid surprise breaks from `rust:latest`. +FROM rust:1.91-bookworm AS builder + +WORKDIR /repo +COPY . /repo/rust/ + +WORKDIR /repo/rust +# Sanity check toolchain version (helps debug CI/Docker Desktop issues). +RUN rustc --version && cargo --version +# Build the service with qdrant support enabled (works even if you don't use qdrant). +RUN cargo build -p claw-rag-service --release --features qdrant-index + +FROM debian:bookworm-slim + +WORKDIR /app +COPY --from=builder /repo/rust/target/release/claw-rag-service /app/claw-rag-service + +EXPOSE 8787 +ENTRYPOINT ["/app/claw-rag-service"] diff --git a/rust/crates/claw-rag-service/src/chunk.rs b/rust/crates/claw-rag-service/src/chunk.rs new file mode 100644 index 00000000..db854c35 --- /dev/null +++ b/rust/crates/claw-rag-service/src/chunk.rs @@ -0,0 +1,41 @@ +//! Split file text into overlapping windows (character-based UTF-8). + +#[must_use] +pub fn chunk_text(text: &str, max_chars: usize, overlap: usize) -> Vec { + if max_chars == 0 { + return Vec::new(); + } + let overlap = overlap.min(max_chars.saturating_sub(1)); + let mut out = Vec::new(); + let chars: Vec = text.chars().collect(); + if chars.is_empty() { + return out; + } + let mut start = 0; + loop { + let end = (start + max_chars).min(chars.len()); + let piece: String = chars[start..end].iter().collect(); + if !piece.trim().is_empty() { + out.push(piece); + } + if end >= chars.len() { + break; + } + let step = max_chars.saturating_sub(overlap).max(1); + start += step; + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn chunks_non_empty() { + let c = chunk_text("hello world test", 5, 2); + assert!(!c.is_empty()); + let joined: String = c.join(""); + assert!(joined.contains("hello")); + } +} diff --git a/rust/crates/claw-rag-service/src/db.rs b/rust/crates/claw-rag-service/src/db.rs new file mode 100644 index 00000000..8b73ec74 --- /dev/null +++ b/rust/crates/claw-rag-service/src/db.rs @@ -0,0 +1,210 @@ +//! `SQLite` storage for chunks and embedding vectors. + +use std::path::Path; + +use rusqlite::{params, Connection}; + +const SCHEMA: &str = r" +CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + path TEXT NOT NULL, + ordinal INTEGER NOT NULL, + text TEXT NOT NULL, + UNIQUE(path, ordinal) +); +CREATE TABLE IF NOT EXISTS embeddings ( + chunk_id INTEGER PRIMARY KEY, + dim INTEGER NOT NULL, + vec BLOB NOT NULL, + FOREIGN KEY (chunk_id) REFERENCES chunks(id) ON DELETE CASCADE +); +CREATE TABLE IF NOT EXISTS files ( + path TEXT PRIMARY KEY, + content_hash TEXT NOT NULL, + size_bytes INTEGER NOT NULL, + mtime_ms INTEGER NOT NULL, + indexed_at_ms INTEGER NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path); +"; + +pub fn open_db(path: &Path) -> Result { + if let Some(parent) = path.parent() { + if !parent.as_os_str().is_empty() { + std::fs::create_dir_all(parent).map_err(|e| e.to_string())?; + } + } + + let conn = Connection::open(path).map_err(|e| e.to_string())?; + conn.execute_batch( + r" +PRAGMA foreign_keys = ON; +PRAGMA journal_mode = WAL; +", + ) + .map_err(|e| e.to_string())?; + conn.execute_batch(SCHEMA).map_err(|e| e.to_string())?; + + Ok(conn) +} + +#[allow(dead_code)] +pub fn truncate_index(conn: &Connection) -> Result<(), String> { + conn.execute_batch("DELETE FROM embeddings; DELETE FROM chunks; DELETE FROM files;") + .map_err(|e| e.to_string())?; + Ok(()) +} + +pub fn file_is_unchanged( + conn: &Connection, + path: &str, + content_hash: &str, + size_bytes: i64, + mtime_ms: i64, +) -> Result { + let mut stmt = conn + .prepare("SELECT content_hash, size_bytes, mtime_ms FROM files WHERE path=?1 LIMIT 1") + .map_err(|e| e.to_string())?; + let mut rows = stmt.query(params![path]).map_err(|e| e.to_string())?; + if let Some(r) = rows.next().map_err(|e| e.to_string())? { + let h: String = r.get(0).map_err(|e| e.to_string())?; + let sz: i64 = r.get(1).map_err(|e| e.to_string())?; + let mt: i64 = r.get(2).map_err(|e| e.to_string())?; + return Ok(h == content_hash && sz == size_bytes && mt == mtime_ms); + } + Ok(false) +} + +pub fn upsert_file_meta( + conn: &Connection, + path: &str, + content_hash: &str, + size_bytes: i64, + mtime_ms: i64, + indexed_at_ms: i64, +) -> Result<(), String> { + conn.execute( + r" +INSERT INTO files(path, content_hash, size_bytes, mtime_ms, indexed_at_ms) +VALUES (?1, ?2, ?3, ?4, ?5) +ON CONFLICT(path) DO UPDATE SET + content_hash=excluded.content_hash, + size_bytes=excluded.size_bytes, + mtime_ms=excluded.mtime_ms, + indexed_at_ms=excluded.indexed_at_ms +", + params![path, content_hash, size_bytes, mtime_ms, indexed_at_ms], + ) + .map_err(|e| e.to_string())?; + Ok(()) +} + +pub fn delete_file_and_chunks(conn: &Connection, path: &str) -> Result<(), String> { + // Delete chunks first (embeddings cascade); then remove file meta. + conn.execute("DELETE FROM chunks WHERE path=?1", params![path]) + .map_err(|e| e.to_string())?; + conn.execute("DELETE FROM files WHERE path=?1", params![path]) + .map_err(|e| e.to_string())?; + Ok(()) +} + +pub fn list_all_files(conn: &Connection) -> Result, String> { + let mut stmt = conn + .prepare("SELECT path FROM files") + .map_err(|e| e.to_string())?; + let rows = stmt + .query_map([], |r| r.get::<_, String>(0)) + .map_err(|e| e.to_string())?; + let mut out = Vec::new(); + for r in rows { + out.push(r.map_err(|e| e.to_string())?); + } + Ok(out) +} + +pub fn insert_chunk( + conn: &Connection, + path: &str, + ordinal: i32, + text: &str, +) -> Result { + conn.execute( + "INSERT INTO chunks (path, ordinal, text) VALUES (?1, ?2, ?3)", + params![path, ordinal, text], + ) + .map_err(|e| e.to_string())?; + Ok(conn.last_insert_rowid()) +} + +pub fn insert_embedding( + conn: &Connection, + chunk_id: i64, + dim: usize, + vec: &[f32], +) -> Result<(), String> { + let bytes = f32_slice_to_blob(vec); + let dim_i64 = i64::try_from(dim).map_err(|_| "embedding dim too large".to_string())?; + conn.execute( + "INSERT INTO embeddings (chunk_id, dim, vec) VALUES (?1, ?2, ?3)", + params![chunk_id, dim_i64, bytes], + ) + .map_err(|e| e.to_string())?; + + Ok(()) +} + +pub(crate) fn f32_slice_to_blob(v: &[f32]) -> Vec { + let mut b = Vec::with_capacity(v.len() * 4); + for x in v { + b.extend_from_slice(&x.to_le_bytes()); + } + b +} + +pub fn blob_to_f32_vec(blob: &[u8], dim: usize) -> Option> { + if blob.len() != dim * 4 { + return None; + } + let mut v = Vec::with_capacity(dim); + for chunk in blob.chunks_exact(4) { + v.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])); + } + Some(v) +} + +#[derive(Debug, Clone)] +pub struct ChunkRow { + pub path: String, + pub text: String, + pub vec: Vec, +} + +pub fn load_all_indexed(conn: &Connection) -> Result, String> { + let mut stmt = conn + .prepare( + "SELECT c.path, c.text, e.dim, e.vec FROM chunks c + INNER JOIN embeddings e ON e.chunk_id = c.id", + ) + .map_err(|e| e.to_string())?; + let mut rows = stmt.query([]).map_err(|e| e.to_string())?; + let mut out = Vec::new(); + while let Some(r) = rows.next().map_err(|e| e.to_string())? { + let path: String = r.get(0).map_err(|e| e.to_string())?; + let text: String = r.get(1).map_err(|e| e.to_string())?; + let dim: i64 = r.get(2).map_err(|e| e.to_string())?; + let blob: Vec = r.get(3).map_err(|e| e.to_string())?; + let dim = usize::try_from(dim).map_err(|_| "invalid embedding dim in db".to_string())?; + let Some(vec) = blob_to_f32_vec(&blob, dim) else { + continue; + }; + out.push(ChunkRow { path, text, vec }); + } + Ok(out) +} + +pub fn chunk_count(conn: &Connection) -> Result { + let n: i64 = conn + .query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0)) + .map_err(|e| e.to_string())?; + Ok(n) +} diff --git a/rust/crates/claw-rag-service/src/embed.rs b/rust/crates/claw-rag-service/src/embed.rs new file mode 100644 index 00000000..27d69054 --- /dev/null +++ b/rust/crates/claw-rag-service/src/embed.rs @@ -0,0 +1,129 @@ +//! OpenAI-compatible embeddings HTTP client. + +use reqwest::Client; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug)] +pub struct EmbedConfig { + pub api_key: String, + pub base_url: String, + pub model: String, +} + +impl EmbedConfig { + pub fn from_env() -> Result { + let api_key = std::env::var("CLAW_RAG_OPENAI_API_KEY") + .or_else(|_| std::env::var("OPENAI_API_KEY")) + .map_err(|_| { + "set CLAW_RAG_OPENAI_API_KEY or OPENAI_API_KEY for embeddings".to_string() + })?; + let base_url = std::env::var("CLAW_RAG_EMBEDDING_BASE_URL") + .unwrap_or_else(|_| "https://api.openai.com/v1".into()); + let model = std::env::var("CLAW_RAG_EMBEDDING_MODEL") + .unwrap_or_else(|_| "text-embedding-3-small".into()); + Ok(Self { + api_key, + base_url: base_url.trim_end_matches('/').to_string(), + model, + }) + } + + /// Deterministic fake vectors for tests / dry-run (1536 dims match common `OpenAI` models; + /// truncated scan still works if dim mismatches — ingest uses same mock for all). + #[must_use] + pub fn mock_from_env() -> Option { + if std::env::var("CLAW_RAG_MOCK_PROVIDERS").ok().as_deref() != Some("1") { + return None; + } + Some(Self { + api_key: "mock".into(), + base_url: "mock://".into(), + model: "mock-embedding".into(), + }) + } +} + +#[derive(Serialize)] +struct EmbeddingsRequest<'a> { + model: &'a str, + input: Vec<&'a str>, +} + +#[derive(Deserialize)] +struct EmbeddingsResponse { + data: Vec, +} + +#[derive(Deserialize)] +struct EmbeddingItem { + embedding: Vec, +} + +pub async fn embed_batch( + client: &Client, + cfg: &EmbedConfig, + texts: &[String], +) -> Result>, String> { + if cfg.base_url.starts_with("mock://") { + return Ok(texts + .iter() + .map(|s| mock_vector_for_text(s.as_str())) + .collect()); + } + + let url = format!("{}/embeddings", cfg.base_url); + let inputs: Vec<&str> = texts.iter().map(String::as_str).collect(); + let body = EmbeddingsRequest { + model: &cfg.model, + input: inputs, + }; + let res = client + .post(&url) + .header("Authorization", format!("Bearer {}", cfg.api_key)) + .header("Content-Type", "application/json") + .json(&body) + .send() + .await + .map_err(|e| e.to_string())?; + if !res.status().is_success() { + let t = res.text().await.unwrap_or_default(); + return Err(format!("embeddings HTTP error: {t}")); + } + let parsed: EmbeddingsResponse = res.json().await.map_err(|e| e.to_string())?; + if parsed.data.len() != texts.len() { + return Err(format!( + "embeddings count mismatch: got {} for {} inputs", + parsed.data.len(), + texts.len() + )); + } + Ok(parsed.data.into_iter().map(|d| d.embedding).collect()) +} + +fn mock_vector_for_text(s: &str) -> Vec { + const DIM: usize = 16; + let mut v = vec![0f32; DIM]; + for (i, b) in s.bytes().enumerate().take(DIM * 4) { + v[i % DIM] += f32::from(b) / 255.0; + } + let norm: f32 = v.iter().map(|x| x * x).sum::().sqrt(); + if norm > 0.0 { + for x in &mut v { + *x /= norm; + } + } + v +} + +pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { + if a.len() != b.len() || a.is_empty() { + return 0.0; + } + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let na: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let nb: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + if na == 0.0 || nb == 0.0 { + return 0.0; + } + dot / (na * nb) +} diff --git a/rust/crates/claw-rag-service/src/ingest.rs b/rust/crates/claw-rag-service/src/ingest.rs new file mode 100644 index 00000000..eae43629 --- /dev/null +++ b/rust/crates/claw-rag-service/src/ingest.rs @@ -0,0 +1,219 @@ +//! Walk workspace and fill `SQLite` + embeddings. + +use std::path::Path; +use std::path::PathBuf; + +use reqwest::Client; +use walkdir::WalkDir; + +use crate::chunk::chunk_text; +use crate::db::{ + delete_file_and_chunks, file_is_unchanged, insert_chunk, insert_embedding, list_all_files, + open_db, upsert_file_meta, +}; +use crate::embed::{embed_batch, EmbedConfig}; +#[cfg(feature = "qdrant-index")] +use crate::qdrant_index::{upsert_points, ChunkPoint}; + +const DEFAULT_MAX_FILE_BYTES: u64 = 2 * 1024 * 1024; +const CHUNK_CHARS: usize = 900; +const CHUNK_OVERLAP: usize = 120; +const EMBED_BATCH: usize = 16; + +static SKIP_DIR_NAMES: &[&str] = &[".git", "target", "node_modules", "__pycache__", ".claw-rag"]; + +static TEXT_EXTENSIONS: &[&str] = &[ + "rs", "md", "toml", "txt", "json", "yaml", "yml", "js", "ts", "tsx", "jsx", "py", "go", "c", + "h", "cpp", "hpp", "cs", "java", "kt", "swift", "rb", "php", "sh", "ps1", "html", "css", "sql", +]; + +#[derive(Debug, Default)] +pub struct IngestStats { + pub files_indexed: usize, + pub chunks_total: usize, + pub embeddings_written: usize, +} + +fn should_skip_dir(path: &Path) -> bool { + path.file_name() + .and_then(std::ffi::OsStr::to_str) + .is_some_and(|n| SKIP_DIR_NAMES.contains(&n)) +} + +fn is_text_extension(path: &Path) -> bool { + path.extension() + .and_then(std::ffi::OsStr::to_str) + .is_some_and(|e| TEXT_EXTENSIONS.contains(&e.to_ascii_lowercase().as_str())) +} + +async fn flush_path_batch( + conn: &rusqlite::Connection, + path: &str, + batch: &mut Vec<(i32, String)>, + client: &Client, + cfg: &EmbedConfig, + stats: &mut IngestStats, +) -> Result<(), String> { + if batch.is_empty() { + return Ok(()); + } + let texts: Vec = batch.iter().map(|(_, t)| t.clone()).collect(); + let vecs = embed_batch(client, cfg, &texts).await?; + if vecs.len() != batch.len() { + return Err("embed batch size mismatch".into()); + } + + #[cfg(feature = "qdrant-index")] + let mut qdrant_points: Vec = Vec::with_capacity(batch.len()); + + for ((ord, t), vec) in batch.drain(..).zip(vecs.into_iter()) { + let dim = vec.len(); + let cid = insert_chunk(conn, path, ord, &t)?; + insert_embedding(conn, cid, dim, &vec)?; + stats.embeddings_written += 1; + + #[cfg(feature = "qdrant-index")] + { + qdrant_points.push(ChunkPoint { + id: cid, + vec, + path: path.to_string(), + text: t, + }); + } + } + + #[cfg(feature = "qdrant-index")] + upsert_points(qdrant_points).await?; + + Ok(()) +} + +pub async fn run_ingest( + workspaces: &[PathBuf], + db_path: &Path, + cfg: &EmbedConfig, + client: &Client, +) -> Result { + let conn = open_db(db_path)?; + + let mut all_files: Vec<(String, PathBuf)> = Vec::new(); + let mut seen_paths: Vec = Vec::new(); + + for ws in workspaces { + let workspace = ws + .canonicalize() + .map_err(|e| format!("workspace: {}: {e}", ws.display()))?; + let ws_prefix = workspace.clone(); + let repo_id = repo_id_for_workspace(&workspace); + + for entry in WalkDir::new(&workspace) + .into_iter() + .filter_entry(|e| !should_skip_dir(e.path())) + { + let entry = entry.map_err(|e| e.to_string())?; + if !entry.file_type().is_file() { + continue; + } + let path = entry.path(); + if !is_text_extension(path) { + continue; + } + let meta = entry.metadata().map_err(|e| e.to_string())?; + if meta.len() > DEFAULT_MAX_FILE_BYTES { + continue; + } + let rel = path + .strip_prefix(&ws_prefix) + .unwrap_or(path) + .to_string_lossy() + .replace('\\', "/"); + let key = format!("{repo_id}:{rel}"); + seen_paths.push(key.clone()); + all_files.push((key, path.to_path_buf())); + } + } + + all_files.sort_by(|a, b| a.0.cmp(&b.0)); + seen_paths.sort(); + + let mut stats = IngestStats { + files_indexed: all_files.len(), + ..Default::default() + }; + + for (rel, file) in all_files { + let Ok(meta) = std::fs::metadata(&file) else { + continue; + }; + let size_bytes = + i64::try_from(meta.len()).map_err(|_| "file size too large".to_string())?; + let mtime_ms = meta + .modified() + .ok() + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .and_then(|d| i64::try_from(d.as_millis()).ok()) + .unwrap_or(0); + + let Ok(raw) = std::fs::read_to_string(&file) else { + continue; + }; + + let content_hash = blake3::hash(raw.as_bytes()).to_hex().to_string(); + if file_is_unchanged(&conn, &rel, &content_hash, size_bytes, mtime_ms)? { + continue; + } + + // Re-index this file: delete previous chunks (and embeddings) for path. + delete_file_and_chunks(&conn, &rel)?; + + let pieces = chunk_text(&raw, CHUNK_CHARS, CHUNK_OVERLAP); + if pieces.is_empty() { + continue; + } + + let mut batch: Vec<(i32, String)> = Vec::new(); + for (ord, piece) in pieces.into_iter().enumerate() { + stats.chunks_total += 1; + let ord_i32 = + i32::try_from(ord).map_err(|_| "file produced too many chunks".to_string())?; + batch.push((ord_i32, piece)); + if batch.len() >= EMBED_BATCH { + flush_path_batch(&conn, &rel, &mut batch, client, cfg, &mut stats).await?; + } + } + flush_path_batch(&conn, &rel, &mut batch, client, cfg, &mut stats).await?; + + let now_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| i64::try_from(d.as_millis()).unwrap_or(0)) + .unwrap_or(0); + upsert_file_meta(&conn, &rel, &content_hash, size_bytes, mtime_ms, now_ms)?; + } + + // Delete entries for files that no longer exist. + // (We compare against file list from DB to avoid needing a SQL "NOT IN" temp table.) + let mut seen_set = std::collections::BTreeSet::new(); + for p in &seen_paths { + seen_set.insert(p.as_str()); + } + for p in list_all_files(&conn)? { + if !seen_set.contains(p.as_str()) { + delete_file_and_chunks(&conn, &p)?; + } + } + + Ok(stats) +} + +fn repo_id_for_workspace(workspace: &Path) -> String { + let name = workspace + .file_name() + .and_then(std::ffi::OsStr::to_str) + .filter(|s| !s.is_empty()) + .unwrap_or("workspace"); + let hash = blake3::hash(workspace.to_string_lossy().as_bytes()) + .to_hex() + .to_string(); + format!("{name}-{h}", name = name, h = &hash[..8]) +} diff --git a/rust/crates/claw-rag-service/src/lib.rs b/rust/crates/claw-rag-service/src/lib.rs new file mode 100644 index 00000000..594fc7b9 --- /dev/null +++ b/rust/crates/claw-rag-service/src/lib.rs @@ -0,0 +1,111 @@ +//! Workspace RAG: ingest files → `SQLite` + embeddings, query via cosine similarity (linear scan MVP). +#![forbid(unsafe_code)] + +mod chunk; +mod db; +mod embed; +mod ingest; +#[cfg(feature = "qdrant-index")] +mod qdrant_index; +mod search; + +pub use db::{chunk_count, open_db}; +pub use embed::EmbedConfig; +pub use ingest::{run_ingest, IngestStats}; +pub use search::query_index; + +use serde::{Deserialize, Serialize}; + +/// One retrieved chunk for the model or UI. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RagHit { + pub path: String, + pub snippet: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub score: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct QueryRequest { + pub query: String, + #[serde(default = "default_top_k")] + pub top_k: u32, +} + +fn default_top_k() -> u32 { + 8 +} + +#[derive(Debug, Clone, Serialize)] +pub struct QueryResponse { + pub hits: Vec, + /// `0-stub` (legacy), `1-sqlite`, `1-sqlite-empty`, `1-sqlite-no-db` + pub phase: &'static str, +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use reqwest::Client; + use tempfile::tempdir; + + use super::*; + + #[tokio::test] + async fn query_missing_db_reports_phase() { + let client = Client::new(); + let cfg = EmbedConfig { + api_key: "x".into(), + base_url: "mock://".into(), + model: "m".into(), + }; + let r = query_index( + Path::new("/no/such/claw_rag.sqlite"), + &client, + &cfg, + &QueryRequest { + query: "hello".into(), + top_k: 3, + }, + ) + .await + .unwrap(); + assert_eq!(r.phase, "1-sqlite-no-db"); + } + + #[tokio::test] + async fn ingest_and_query_roundtrip_mock() { + std::env::set_var("CLAW_RAG_MOCK_PROVIDERS", "1"); + let dir = tempdir().unwrap(); + let ws1 = dir.path().join("ws1"); + let ws2 = dir.path().join("ws2"); + std::fs::create_dir_all(&ws1).unwrap(); + std::fs::create_dir_all(&ws2).unwrap(); + std::fs::write(ws1.join("note.md"), "hello RAG service test content").unwrap(); + std::fs::write(ws2.join("docs.md"), "secondary repo doc about embeddings").unwrap(); + let db = dir.path().join("idx.sqlite"); + let client = Client::new(); + let cfg = EmbedConfig::mock_from_env().expect("mock"); + let st = run_ingest(&[ws1.clone(), ws2.clone()], &db, &cfg, &client) + .await + .unwrap(); + assert!(st.embeddings_written >= 1); + + let r = query_index( + &db, + &client, + &cfg, + &QueryRequest { + query: "RAG service".into(), + top_k: 4, + }, + ) + .await + .unwrap(); + assert_eq!(r.phase, "1-sqlite"); + assert!(!r.hits.is_empty()); + assert!(r.hits.iter().all(|h| h.path.contains(':'))); + std::env::remove_var("CLAW_RAG_MOCK_PROVIDERS"); + } +} diff --git a/rust/crates/claw-rag-service/src/main.rs b/rust/crates/claw-rag-service/src/main.rs new file mode 100644 index 00000000..8fe898db --- /dev/null +++ b/rust/crates/claw-rag-service/src/main.rs @@ -0,0 +1,175 @@ +//! `claw-rag-service` — HTTP API + `ingest` subcommand. + +use std::path::PathBuf; +use std::sync::Arc; + +use axum::{ + extract::State, + http::StatusCode, + response::Html, + routing::{get, post}, + Json, Router, +}; +use clap::{Parser, Subcommand}; +use claw_rag_service::{ + chunk_count, open_db, query_index, run_ingest, EmbedConfig, QueryRequest, QueryResponse, +}; + +#[derive(Parser)] +#[command( + name = "claw-rag-service", + about = "Workspace RAG index + HTTP query API" +)] +struct Cli { + #[command(subcommand)] + command: Option, +} + +#[derive(Subcommand)] +enum Cmd { + /// Run HTTP server (default when no subcommand). + Serve(ServeArgs), + /// Index a workspace into `SQLite` (calls embedding API). + Ingest(IngestArgs), +} + +#[derive(Parser)] +struct ServeArgs { + #[arg(long, env = "CLAW_RAG_DB", default_value = ".claw-rag/index.sqlite")] + db: PathBuf, +} + +#[derive(Parser)] +struct IngestArgs { + /// Workspace roots to ingest. Repeat `--workspace` to ingest multiple repos (cross-repo RAG). + #[arg(short, long)] + workspace: Vec, + #[arg(long, env = "CLAW_RAG_DB", default_value = ".claw-rag/index.sqlite")] + db: PathBuf, +} + +#[derive(Clone)] +struct AppState { + db_path: PathBuf, + client: reqwest::Client, + cfg: EmbedConfig, +} + +/// Single-page UI for phase 3 (served at `GET /`). +static INDEX_HTML: &str = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/static/index.html")); + +async fn ui_index() -> Html<&'static str> { + Html(INDEX_HTML) +} + +fn rag_router(state: Arc) -> Router { + Router::new() + .route("/", get(ui_index)) + .route("/health", get(|| async { "ok" })) + .route("/v1/stats", get(stats)) + .route("/v1/query", post(query)) + .with_state(state) +} + +fn resolve_embed_config() -> Result { + if let Some(c) = EmbedConfig::mock_from_env() { + return Ok(c); + } + EmbedConfig::from_env() +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + // Load `.env` if present (walks up parent directories). + // This is a convenience for local development; CI/production should set real env vars. + let _ = dotenvy::dotenv(); + + let cli = Cli::parse(); + + if let Some(Cmd::Ingest(a)) = cli.command { + let cfg = resolve_embed_config()?; + let client = reqwest::Client::new(); + let st = run_ingest(&a.workspace, &a.db, &cfg, &client).await?; + eprintln!( + "ingest: files={} chunks={} embeddings={}", + st.files_indexed, st.chunks_total, st.embeddings_written + ); + return Ok(()); + } + + let db = if let Some(Cmd::Serve(s)) = cli.command { + s.db + } else { + PathBuf::from( + std::env::var("CLAW_RAG_DB").unwrap_or_else(|_| ".claw-rag/index.sqlite".into()), + ) + }; + + let cfg = resolve_embed_config()?; + let state = Arc::new(AppState { + db_path: db, + client: reqwest::Client::new(), + cfg, + }); + + let app = rag_router(state.clone()); + + let port: u16 = std::env::var("CLAW_RAG_PORT") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(8787); + let host: std::net::IpAddr = std::env::var("CLAW_RAG_HOST") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST)); + let addr = std::net::SocketAddr::from((host, port)); + eprintln!( + "claw-rag-service db={} listen=http://{addr}", + state.db_path.display() + ); + let listener = tokio::net::TcpListener::bind(addr).await?; + axum::serve(listener, app).await?; + Ok(()) +} + +async fn stats(State(state): State>) -> Result, StatusCode> { + let path = state.db_path.clone(); + if !path.is_file() { + return Ok(Json(serde_json::json!({ + "chunks": 0, + "phase": "1-sqlite-no-db" + }))); + } + let res = tokio::task::spawn_blocking(move || { + let conn = open_db(&path).map_err(|_| ())?; + chunk_count(&conn).map_err(|_| ()) + }) + .await + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)? + .map_err(|()| StatusCode::INTERNAL_SERVER_ERROR)?; + Ok(Json(serde_json::json!({ + "chunks": res, + "phase": "1-sqlite" + }))) +} + +async fn query( + State(state): State>, + Json(req): Json, +) -> Result, (StatusCode, String)> { + query_index(&state.db_path, &state.client, &state.cfg, &req) + .await + .map(Json) + .map_err(|e| (StatusCode::BAD_REQUEST, e)) +} + +#[cfg(test)] +mod tests { + use super::INDEX_HTML; + + #[test] + fn index_html_wires_api_paths() { + assert!(INDEX_HTML.contains("/v1/stats")); + assert!(INDEX_HTML.contains("/v1/query")); + } +} diff --git a/rust/crates/claw-rag-service/src/qdrant_index.rs b/rust/crates/claw-rag-service/src/qdrant_index.rs new file mode 100644 index 00000000..c997eb4b --- /dev/null +++ b/rust/crates/claw-rag-service/src/qdrant_index.rs @@ -0,0 +1,177 @@ +use crate::{QueryResponse, RagHit}; +use serde_json::json; + +async fn ensure_collection( + client: &qdrant_client::Qdrant, + collection: &str, + dim: usize, +) -> Result<(), String> { + let dim_u64 = u64::try_from(dim).map_err(|_| "embedding dim too large".to_string())?; + + // Try to create the collection; if it already exists, Qdrant will error. + // We treat "already exists" as success to keep ingest idempotent. + let res = client + .create_collection( + qdrant_client::qdrant::CreateCollectionBuilder::new(collection).vectors_config( + qdrant_client::qdrant::VectorParamsBuilder::new( + dim_u64, + qdrant_client::qdrant::Distance::Cosine, + ), + ), + ) + .await; + + match res { + Ok(_) => Ok(()), + Err(e) => { + let msg = e.to_string(); + if msg.contains("already exists") || msg.contains("Already exists") { + Ok(()) + } else { + Err(format!("qdrant create_collection: {e}")) + } + } + } +} + +#[derive(Debug, Clone)] +pub struct QdrantConfig { + pub url: String, + pub api_key: Option, + pub collection: String, +} + +impl QdrantConfig { + pub fn from_env() -> Option { + let url = std::env::var("CLAW_RAG_QDRANT_URL").ok()?; + let collection = std::env::var("CLAW_RAG_QDRANT_COLLECTION") + .ok() + .unwrap_or_else(|| "claw_rag_chunks".to_string()); + let api_key = std::env::var("CLAW_RAG_QDRANT_API_KEY").ok(); + Some(Self { + url, + api_key, + collection, + }) + } +} + +pub async fn query_qdrant(q: &[f32], top_k: u32) -> Result, String> { + let Some(cfg) = QdrantConfig::from_env() else { + return Ok(None); + }; + + let limit = top_k.min(64); + let mut client = qdrant_client::Qdrant::from_url(&cfg.url); + if let Some(key) = &cfg.api_key { + client = client.api_key(key.clone()); + } + let client = client.build().map_err(|e| format!("qdrant client: {e}"))?; + + // If collection doesn't exist yet, treat it as "no results" and fall back. + // (We avoid creating it on query because ingest controls dimension/model.) + if let Err(e) = client.collection_info(&cfg.collection).await { + let msg = e.to_string(); + if msg.contains("doesn't exist") || msg.contains("Not found") { + return Ok(None); + } + return Err(format!("qdrant collection_info: {e}")); + } + + let res = client + .query( + qdrant_client::qdrant::QueryPointsBuilder::new(&cfg.collection) + .query(q.to_vec()) + .limit(u64::from(limit)) + .with_payload(true), + ) + .await + .map_err(|e| format!("qdrant query: {e}"))?; + + let mut hits = Vec::new(); + for p in res.result { + let payload = p.payload; + let path = payload + .get("path") + .and_then(|v| v.as_str()) + .map(ToString::to_string) + .unwrap_or_default(); + let text = payload + .get("text") + .and_then(|v| v.as_str()) + .map(ToString::to_string) + .unwrap_or_default(); + let score = p.score; + if !path.is_empty() { + hits.push(RagHit { + path, + snippet: truncate_snippet(&text, 480), + score: Some(score), + }); + } + } + + Ok(Some(QueryResponse { + hits, + phase: "2-qdrant", + })) +} + +#[derive(Debug, Clone)] +pub struct ChunkPoint { + pub id: i64, + pub vec: Vec, + pub path: String, + pub text: String, +} + +pub async fn upsert_points(points: Vec) -> Result<(), String> { + let Some(cfg) = QdrantConfig::from_env() else { + return Ok(()); + }; + if points.is_empty() { + return Ok(()); + } + + let mut client = qdrant_client::Qdrant::from_url(&cfg.url); + if let Some(key) = &cfg.api_key { + client = client.api_key(key.clone()); + } + let client = client.build().map_err(|e| format!("qdrant client: {e}"))?; + + let dim = points[0].vec.len(); + ensure_collection(&client, &cfg.collection, dim).await?; + + let mut qpoints = Vec::with_capacity(points.len()); + for p in points { + if p.vec.len() != dim { + return Err("qdrant upsert: embedding dimension mismatch within batch".to_string()); + } + let id = u64::try_from(p.id).map_err(|_| "chunk id must be non-negative".to_string())?; + let payload_map = serde_json::Map::from_iter([ + ("path".to_string(), json!(p.path)), + ("text".to_string(), json!(p.text)), + ]); + let payload: qdrant_client::Payload = payload_map.into(); + + qpoints.push(qdrant_client::qdrant::PointStruct::new(id, p.vec, payload)); + } + + client + .upsert_points(qdrant_client::qdrant::UpsertPointsBuilder::new( + &cfg.collection, + qpoints, + )) + .await + .map_err(|e| format!("qdrant upsert: {e}"))?; + + Ok(()) +} + +fn truncate_snippet(s: &str, max_chars: usize) -> String { + let n = s.chars().count(); + if n <= max_chars { + return s.to_string(); + } + s.chars().take(max_chars).collect::() + "…" +} diff --git a/rust/crates/claw-rag-service/src/search.rs b/rust/crates/claw-rag-service/src/search.rs new file mode 100644 index 00000000..c4fe3ca2 --- /dev/null +++ b/rust/crates/claw-rag-service/src/search.rs @@ -0,0 +1,87 @@ +//! Vector search over indexed chunks (linear scan MVP). + +use std::path::Path; + +use reqwest::Client; + +use crate::db::{load_all_indexed, open_db}; +use crate::embed::{cosine_similarity, embed_batch, EmbedConfig}; +use crate::{QueryRequest, QueryResponse, RagHit}; + +pub async fn query_index( + db_path: &Path, + client: &Client, + cfg: &EmbedConfig, + req: &QueryRequest, +) -> Result { + if !db_path.is_file() { + return Ok(QueryResponse { + hits: Vec::new(), + phase: "1-sqlite-no-db", + }); + } + + let conn = open_db(db_path)?; + let qvecs = embed_batch(client, cfg, std::slice::from_ref(&req.query)).await?; + let q = qvecs + .into_iter() + .next() + .ok_or_else(|| "no query embedding".to_string())?; + + #[cfg(feature = "qdrant-index")] + if let Ok(Some(r)) = crate::qdrant_index::query_qdrant(&q, req.top_k).await { + return Ok(r); + } + + let rows = load_all_indexed(&conn)?; + drop(conn); + + if rows.is_empty() { + return Ok(QueryResponse { + hits: Vec::new(), + phase: "1-sqlite-empty", + }); + } + + let expected = rows[0].vec.len(); + if q.len() != expected { + return Err(format!( + "embedding dimension mismatch: index uses dim {} but query embedding has {} (same model/env as ingest required)", + expected, q.len() + )); + } + + let mut scored: Vec<(f32, usize)> = rows + .iter() + .enumerate() + .map(|(i, r)| (cosine_similarity(&q, &r.vec), i)) + .collect(); + scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)); + + let top = req.top_k.min(64) as usize; + let hits: Vec = scored + .into_iter() + .take(top) + .map(|(score, i)| { + let r = &rows[i]; + RagHit { + path: r.path.clone(), + snippet: truncate_snippet(&r.text, 480), + score: Some(score), + } + }) + .collect(); + + Ok(QueryResponse { + hits, + phase: "1-sqlite", + }) +} + +fn truncate_snippet(s: &str, max_chars: usize) -> String { + let n = s.chars().count(); + if n <= max_chars { + return s.to_string(); + } + s.chars().take(max_chars).collect::() + "…" +} diff --git a/rust/crates/claw-rag-service/static/index.html b/rust/crates/claw-rag-service/static/index.html new file mode 100644 index 00000000..c3516202 --- /dev/null +++ b/rust/crates/claw-rag-service/static/index.html @@ -0,0 +1,233 @@ + + + + + + claw-rag + + + +
+

claw-rag-service

+

Local index · same-origin /v1/* API

+
+
+
+ chunks: + phase: + +
+ +
+
+ + +
+
+
+ + +
+ +
+
+ +
+
+ +
+ Index is read-only here; run claw-rag-service ingest to (re)build. Phase 3 UI — no auth; bind to loopback only in production. +
+
+ + +