Spaces:
Sleeping
Sleeping
VectorMind CI committed on
Commit ·
4523f98
1
Parent(s): 24c394f
deploy: ba672bd from MK23IS092/msrit_clockwork
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .env.example +42 -0
- .gitattributes +0 -35
- Dockerfile +64 -0
- README.md +11 -6
- agents/__init__.py +1 -0
- agents/base_agent.py +159 -0
- agents/ingestion_agent.py +334 -0
- agents/memory_agent.py +144 -0
- agents/message_bus.py +145 -0
- agents/reasoning_agent.py +390 -0
- agents/retraining_agent.py +149 -0
- config.py +132 -0
- data/pipeline_runs/147b5293-7c7b-4c13-85ff-b36a5208d85d/6c07740b-6b6f-4e74-9fec-7d72cee11a1c/pipeline.py +5 -0
- data/pipeline_runs/147b5293-7c7b-4c13-85ff-b36a5208d85d/6c07740b-6b6f-4e74-9fec-7d72cee11a1c/run.log +1 -0
- data/pipeline_runs/1eb5fe5c-8c4b-4478-93fc-7b0502ebc54b/ceba6666-3bc5-42f3-b27a-60b8940793e5/pipeline.py +5 -0
- data/pipeline_runs/1eb5fe5c-8c4b-4478-93fc-7b0502ebc54b/ceba6666-3bc5-42f3-b27a-60b8940793e5/run.log +1 -0
- data/pipeline_runs/494ff8ba-8b86-453a-8b40-750c5e34a634/d384b981-2582-466c-9524-8a101a5ec944/pipeline.py +5 -0
- data/pipeline_runs/494ff8ba-8b86-453a-8b40-750c5e34a634/d384b981-2582-466c-9524-8a101a5ec944/run.log +1 -0
- data/pipeline_runs/4a8279c0-16db-4713-bfca-b83f2b7a4e53/1d806c97-a4b8-4961-b1a3-bf3bf2e36996/pipeline.py +5 -0
- data/pipeline_runs/4a8279c0-16db-4713-bfca-b83f2b7a4e53/1d806c97-a4b8-4961-b1a3-bf3bf2e36996/run.log +1 -0
- data/pipeline_runs/83e14744-9a0a-4ece-829d-e9ad0dba3cbf/bda049c6-7392-4adf-853b-b59d66ade0fe/pipeline.py +5 -0
- data/pipeline_runs/83e14744-9a0a-4ece-829d-e9ad0dba3cbf/bda049c6-7392-4adf-853b-b59d66ade0fe/run.log +1 -0
- data/pipeline_runs/92ceb65d-565f-457c-a099-5f2fff00f093/39e0cfab-b5c4-4e03-ba91-9e3ce56a4482/pipeline.py +4 -0
- data/pipeline_runs/92ceb65d-565f-457c-a099-5f2fff00f093/39e0cfab-b5c4-4e03-ba91-9e3ce56a4482/run.log +1 -0
- data/pipeline_runs/d30f2c12-4c47-40db-ba18-a7edd989515c/68664895-2b01-4fc4-9465-9413515b0d90/pipeline.py +92 -0
- data/pipeline_runs/d30f2c12-4c47-40db-ba18-a7edd989515c/68664895-2b01-4fc4-9465-9413515b0d90/run.log +4 -0
- data/pipeline_runs/eb542fc3-721b-4cdd-ac4c-0c54f62ca512/d255fb9e-b352-47a5-b055-77f1b70709dd/pipeline.py +4 -0
- data/pipeline_runs/eb542fc3-721b-4cdd-ac4c-0c54f62ca512/d255fb9e-b352-47a5-b055-77f1b70709dd/run.log +1 -0
- db/__init__.py +1 -0
- db/database.py +527 -0
- delivery/__init__.py +1 -0
- delivery/api_routes.py +781 -0
- delivery/colab_publisher.py +98 -0
- delivery/telegram_bot.py +398 -0
- embeddings/__init__.py +1 -0
- embeddings/engine.py +154 -0
- embeddings/vector_store.py +295 -0
- ingestion/__init__.py +1 -0
- ingestion/arxiv_crawler.py +122 -0
- ingestion/blog_crawler.py +72 -0
- ingestion/github_crawler.py +161 -0
- ingestion/patent_crawler.py +94 -0
- ingestion/pdf_parser.py +83 -0
- ingestion/schema.py +142 -0
- ingestion/social_crawler.py +66 -0
- ingestion/startup_crawler.py +69 -0
- intelligence/__init__.py +1 -0
- intelligence/blueprint_engine.py +358 -0
- intelligence/experiment_designer.py +68 -0
- intelligence/pipeline_executor.py +240 -0
.env.example
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core
|
| 2 |
+
API_HOST=0.0.0.0
|
| 3 |
+
API_PORT=8000
|
| 4 |
+
|
| 5 |
+
# Model providers
|
| 6 |
+
GEMINI_API_KEY=
|
| 7 |
+
HUGGINGFACE_TOKEN=
|
| 8 |
+
KAGGLE_USERNAME=
|
| 9 |
+
KAGGLE_KEY=
|
| 10 |
+
GITHUB_TOKEN=
|
| 11 |
+
TELEGRAM_BOT_TOKEN=
|
| 12 |
+
# Optional. If unset, message the bot once, then restart the API or run scripts/discover_telegram_chat.py to discover the chat ID.
|
| 13 |
+
TELEGRAM_CHAT_ID=
|
| 14 |
+
|
| 15 |
+
# Ingestion controls
|
| 16 |
+
ENABLE_PATENTS_REAL=true
|
| 17 |
+
ENABLE_STARTUPS_REAL=true
|
| 18 |
+
ENABLE_SOCIAL_REAL=true
|
| 19 |
+
ENABLE_BLOG_REAL=true
|
| 20 |
+
ALLOW_SIMULATED_SOURCES=true
|
| 21 |
+
|
| 22 |
+
# Distributed infra backends
|
| 23 |
+
DB_BACKEND=postgres
|
| 24 |
+
POSTGRES_HOST=localhost
|
| 25 |
+
POSTGRES_PORT=5432
|
| 26 |
+
POSTGRES_DB=vectormind
|
| 27 |
+
POSTGRES_USER=vectormind
|
| 28 |
+
POSTGRES_PASSWORD=vectormind
|
| 29 |
+
POSTGRES_DSN=postgresql://vectormind:vectormind@localhost:5432/vectormind
|
| 30 |
+
|
| 31 |
+
STATE_STORE_BACKEND=redis
|
| 32 |
+
REDIS_URL=redis://localhost:6379/0
|
| 33 |
+
|
| 34 |
+
MESSAGE_BUS_BACKEND=kafka_mirror
|
| 35 |
+
KAFKA_BOOTSTRAP_SERVERS=localhost:9092
|
| 36 |
+
KAFKA_TOPIC_PREFIX=vectormind
|
| 37 |
+
|
| 38 |
+
# Pipeline runtime
|
| 39 |
+
PIPELINE_RUN_TIMEOUT_SECONDS=1800
|
| 40 |
+
PIPELINE_MAX_CONCURRENT_RUNS=2
|
| 41 |
+
PIPELINE_MAX_RETRIES=1
|
| 42 |
+
PIPELINE_RETRY_BACKOFF_SECONDS=5
|
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 2 |
+
# VectorMind backend — Hugging Face Spaces (Docker SDK) image.
|
| 3 |
+
#
|
| 4 |
+
# Why this Dockerfile is the way it is:
|
| 5 |
+
# • HF Spaces (Docker SDK) expect the app to listen on port 7860.
|
| 6 |
+
# • HF builds are slow; we install CPU-only torch from the PyTorch CPU index
|
| 7 |
+
# so we don't pull a multi-GB CUDA wheel that we'll never use.
|
| 8 |
+
# • Persistence: HF Spaces give us /data (writable) but the rest of the FS
|
| 9 |
+
# resets on rebuild. We point DATA_DIR there so SQLite + run logs survive
|
| 10 |
+
# restarts, even though they are still wiped when the Space is rebuilt.
|
| 11 |
+
# • Everything heavy is opt-in via env vars (Postgres/Redis/Kafka). For the
|
| 12 |
+
# demo the SQLite + in-memory paths are used — no external services
|
| 13 |
+
# required to stand up the demo.
|
| 14 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 15 |
+
|
| 16 |
+
FROM python:3.11-slim AS base
|
| 17 |
+
|
| 18 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 19 |
+
PYTHONUNBUFFERED=1 \
|
| 20 |
+
PIP_NO_CACHE_DIR=1 \
|
| 21 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 22 |
+
HF_HOME=/data/.cache/huggingface \
|
| 23 |
+
TRANSFORMERS_CACHE=/data/.cache/huggingface \
|
| 24 |
+
SENTENCE_TRANSFORMERS_HOME=/data/.cache/huggingface
|
| 25 |
+
|
| 26 |
+
# System deps:
|
| 27 |
+
# • libgomp1 — required by torch (OpenMP runtime)
|
| 28 |
+
# • git/curl — handy for runtime downloads of model weights
|
| 29 |
+
# • build-essential not needed because we install slim binary wheels
|
| 30 |
+
RUN apt-get update \
|
| 31 |
+
&& apt-get install -y --no-install-recommends \
|
| 32 |
+
git curl libgomp1 ca-certificates \
|
| 33 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 34 |
+
|
| 35 |
+
WORKDIR /app
|
| 36 |
+
|
| 37 |
+
# Install CPU-only torch first so the heavy wheel is cached and we don't
|
| 38 |
+
# accidentally pull a 2GB CUDA build via a transitive dep.
|
| 39 |
+
RUN pip install --upgrade pip \
|
| 40 |
+
&& pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cpu
|
| 41 |
+
|
| 42 |
+
COPY requirements.txt .
|
| 43 |
+
RUN pip install -r requirements.txt
|
| 44 |
+
|
| 45 |
+
COPY . /app
|
| 46 |
+
|
| 47 |
+
# HF Spaces conventions:
|
| 48 |
+
# • port 7860
|
| 49 |
+
# • /data is the only persistent writable mount
|
| 50 |
+
ENV API_HOST=0.0.0.0 \
|
| 51 |
+
API_PORT=7860 \
|
| 52 |
+
DB_BACKEND=sqlite \
|
| 53 |
+
STATE_STORE_BACKEND=sqlite \
|
| 54 |
+
MESSAGE_BUS_BACKEND=in_memory \
|
| 55 |
+
USE_MOCK_LLM=false \
|
| 56 |
+
HF_DEPLOYMENT=true
|
| 57 |
+
|
| 58 |
+
# /data is writable on HF; pre-create our subfolders so first-write succeeds.
|
| 59 |
+
RUN mkdir -p /data/cache /data/pipeline_runs \
|
| 60 |
+
&& chmod -R 777 /data
|
| 61 |
+
|
| 62 |
+
EXPOSE 7860
|
| 63 |
+
|
| 64 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,11 +1,16 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
-
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: VectorMind Backend
|
| 3 |
+
emoji: 🧠
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
+
short_description: Autonomous AI research intelligence backend
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# VectorMind Backend
|
| 13 |
+
|
| 14 |
+
FastAPI service that powers the VectorMind Android app. Source of
|
| 15 |
+
truth lives in [the GitHub repo](https://github.com/MK23IS092/msrit_clockwork) — this Space
|
| 16 |
+
is auto-deployed on every push to `main`.
|
agents/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# VectorMind Agents Package
|
agents/base_agent.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base Agent — Abstract base class for all OpenClaw agents.
|
| 2 |
+
|
| 3 |
+
Provides lifecycle management, event loop, health checking,
|
| 4 |
+
and state checkpointing.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import logging
|
| 11 |
+
from abc import ABC, abstractmethod
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
import config
|
| 16 |
+
from agents.message_bus import MessageBus
|
| 17 |
+
from ingestion.schema import AgentEvent
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger("vectormind.agent")
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
import redis.asyncio as redis_async
|
| 23 |
+
except Exception: # pragma: no cover - optional dependency
|
| 24 |
+
redis_async = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class BaseAgent(ABC):
    """Abstract base class for VectorMind agents.

    Owns the agent lifecycle (``start`` / ``stop``), a polling loop that
    pulls events from every subscribed message-bus queue, a health snapshot,
    and an optional Redis heartbeat checkpoint. Subclasses must implement
    ``process_event`` and may override ``setup`` and ``periodic_task``.
    """

    # Pause between polling passes so the loop does not busy-wait.
    POLL_INTERVAL_SECONDS = 0.1
    # Pause after an unexpected error before the loop resumes.
    ERROR_BACKOFF_SECONDS = 1.0
    # Expiry applied to the Redis heartbeat hash on every checkpoint.
    CHECKPOINT_TTL_SECONDS = 3600

    def __init__(self, name: str):
        """Create an idle agent bound to the shared message bus.

        Args:
            name: Identifier used in log lines, published events and the
                Redis checkpoint key.
        """
        self.name = name
        self.bus = MessageBus.get_instance()
        self._running = False
        self._task: Optional[asyncio.Task] = None
        self._last_heartbeat = datetime.utcnow()
        self._events_processed = 0
        self._status = "idle"
        self._subscribed_topics: list[str] = []
        self._queues: list[asyncio.Queue] = []
        # Redis client; only created in start() when the redis backend is on.
        self._state_store = None

    @property
    def status(self) -> str:
        """Current status string (e.g. "idle", "running", "error", "stopped")."""
        return self._status

    @property
    def is_running(self) -> bool:
        """True between a successful ``start()`` and the next ``stop()``."""
        return self._running

    def get_health(self) -> dict:
        """Return agent health status.

        Returns:
            A dict with the agent name, status, running flag, number of
            events processed and the ISO-formatted last heartbeat.
        """
        return {
            "name": self.name,
            "status": self._status,
            "running": self._running,
            "events_processed": self._events_processed,
            "last_heartbeat": self._last_heartbeat.isoformat(),
        }

    def subscribe(self, topic: str):
        """Subscribe to a message bus topic.

        Subscribing to an already-subscribed topic is a no-op. ``setup()``
        runs on every ``start()``, so without this guard a restarted agent
        would register a second queue for the same topic.

        Args:
            topic: Message bus topic name.
        """
        if topic in self._subscribed_topics:
            return
        queue = self.bus.subscribe(topic)
        self._queues.append(queue)
        self._subscribed_topics.append(topic)
        logger.info("Agent '%s' subscribed to '%s'", self.name, topic)

    async def publish(self, topic: str, payload: dict):
        """Publish an event to the message bus with this agent as the sender.

        Args:
            topic: Message bus topic name.
            payload: Event payload forwarded to the bus unchanged.
        """
        await self.bus.publish_simple(topic, self.name, payload)

    async def start(self):
        """Start the agent's event loop.

        Calling ``start()`` on an agent that is already running is ignored,
        so a second loop task is never spawned and the Redis client is not
        replaced (and leaked).
        """
        if self._running:
            logger.warning(
                "Agent '%s' is already running; start() ignored", self.name
            )
            return
        self._running = True
        self._status = "running"
        if (
            config.STATE_STORE_BACKEND == "redis"
            and redis_async is not None
            and self._state_store is None
        ):
            self._state_store = redis_async.from_url(
                config.REDIS_URL, decode_responses=True
            )
        self.setup()
        logger.info("Agent '%s' started", self.name)
        self._task = asyncio.create_task(self._run_loop())

    async def stop(self):
        """Stop the agent, cancel its loop task and close the state store.

        Safe to call more than once: the task and state-store references are
        cleared before they are awaited/closed, so a repeated call finds
        nothing left to tear down.
        """
        self._running = False
        self._status = "stopped"
        task, self._task = self._task, None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        store, self._state_store = self._state_store, None
        if store is not None:
            await store.close()
        logger.info("Agent '%s' stopped", self.name)

    async def _run_loop(self):
        """Main event processing loop.

        Each pass takes at most one pending event from every subscribed
        queue, runs ``periodic_task()`` and then sleeps for
        ``POLL_INTERVAL_SECONDS``. Unexpected errors are logged with a
        traceback, the status is set to "error" for
        ``ERROR_BACKOFF_SECONDS`` and the loop then resumes.
        """
        while self._running:
            try:
                # Process events from all subscribed queues
                for queue in self._queues:
                    # Only the non-blocking get is guarded, so a QueueEmpty
                    # raised inside process_event is reported as an error
                    # instead of being silently swallowed.
                    try:
                        event = queue.get_nowait()
                    except asyncio.QueueEmpty:
                        continue
                    await self.process_event(event)
                    self._events_processed += 1
                    self._last_heartbeat = datetime.utcnow()
                    await self._checkpoint_state()

                # Run periodic tasks
                await self.periodic_task()

                # Small sleep to prevent busy-waiting
                await asyncio.sleep(self.POLL_INTERVAL_SECONDS)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.exception("Agent '%s' error: %s", self.name, e)
                self._status = "error"
                await asyncio.sleep(self.ERROR_BACKOFF_SECONDS)
                self._status = "running"

    async def _checkpoint_state(self):
        """Persist light agent heartbeat/status to Redis when enabled.

        Does nothing when no state store is configured. Failures are logged
        and swallowed so a Redis problem never interrupts event processing.
        """
        if self._state_store is None:
            return
        try:
            key = f"vectormind:agent:{self.name}:state"
            await self._state_store.hset(
                key,
                mapping={
                    "status": self._status,
                    "events_processed": str(self._events_processed),
                    "last_heartbeat": self._last_heartbeat.isoformat(),
                },
            )
            await self._state_store.expire(key, self.CHECKPOINT_TTL_SECONDS)
        except Exception as e:
            logger.error("Agent '%s' checkpoint failed: %s", self.name, e)

    def setup(self):
        """Optional setup hook called by ``start()`` before the loop task is created."""
        pass

    @abstractmethod
    async def process_event(self, event: AgentEvent):
        """Process a single event from the message bus.

        Must be implemented by subclasses.
        """
        pass

    async def periodic_task(self):
        """Optional periodic task that runs each loop iteration.

        Override in subclasses for scheduled work.
        """
        pass
|
agents/ingestion_agent.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingestion Agent — Manages all data source crawlers.
|
| 2 |
+
|
| 3 |
+
Orchestrates the arXiv, GitHub, patent, startup, social, and blog crawlers, monitors source health,
|
| 4 |
+
and publishes new research signals to the message bus.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import logging
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
from agents.base_agent import BaseAgent
|
| 14 |
+
from embeddings.engine import EmbeddingEngine
|
| 15 |
+
from embeddings.vector_store import VectorStore
|
| 16 |
+
from ingestion.arxiv_crawler import ArxivCrawler
|
| 17 |
+
from ingestion.blog_crawler import BlogCrawler
|
| 18 |
+
from ingestion.github_crawler import GitHubCrawler
|
| 19 |
+
from ingestion.patent_crawler import PatentCrawler
|
| 20 |
+
from ingestion.schema import AgentEvent, ResearchSignal, SignalSource
|
| 21 |
+
from ingestion.social_crawler import SocialCrawler
|
| 22 |
+
from ingestion.startup_crawler import StartupCrawler
|
| 23 |
+
import config
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger("vectormind.ingestion_agent")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class IngestionAgent(BaseAgent):
|
| 29 |
+
"""Agent that manages data ingestion from all sources."""
|
| 30 |
+
|
| 31 |
+
def __init__(self):
|
| 32 |
+
super().__init__("IngestionAgent")
|
| 33 |
+
self.arxiv_crawler = ArxivCrawler(
|
| 34 |
+
categories=config.ARXIV_CATEGORIES,
|
| 35 |
+
max_results=config.ARXIV_MAX_RESULTS,
|
| 36 |
+
)
|
| 37 |
+
self.github_crawler = GitHubCrawler(
|
| 38 |
+
languages=config.GITHUB_TRENDING_LANGUAGES,
|
| 39 |
+
max_results=config.GITHUB_MAX_RESULTS,
|
| 40 |
+
token=config.GITHUB_TOKEN,
|
| 41 |
+
)
|
| 42 |
+
self.patent_crawler = PatentCrawler()
|
| 43 |
+
self.startup_crawler = StartupCrawler()
|
| 44 |
+
self.social_crawler = SocialCrawler()
|
| 45 |
+
self.blog_crawler = BlogCrawler()
|
| 46 |
+
self.embedding_engine = EmbeddingEngine.get_instance()
|
| 47 |
+
self.vector_store = VectorStore.get_instance()
|
| 48 |
+
|
| 49 |
+
self._last_ingestion = None
|
| 50 |
+
self._ingestion_count = 0
|
| 51 |
+
self._source_health = {
|
| 52 |
+
"arxiv": {"status": "healthy", "last_success": None, "failures": 0},
|
| 53 |
+
"github": {"status": "healthy", "last_success": None, "failures": 0},
|
| 54 |
+
"patents": {"status": "healthy", "last_success": None, "failures": 0},
|
| 55 |
+
"startups": {"status": "healthy", "last_success": None, "failures": 0},
|
| 56 |
+
"social": {"status": "healthy", "last_success": None, "failures": 0},
|
| 57 |
+
"blog": {"status": "healthy", "last_success": None, "failures": 0},
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
def setup(self):
|
| 61 |
+
self.subscribe("ingestion.trigger")
|
| 62 |
+
|
| 63 |
+
async def process_event(self, event: AgentEvent):
|
| 64 |
+
"""Handle ingestion trigger events."""
|
| 65 |
+
if event.topic == "ingestion.trigger":
|
| 66 |
+
category = event.payload.get("category")
|
| 67 |
+
source = event.payload.get("source", "all")
|
| 68 |
+
await self.run_ingestion(source=source, category=category)
|
| 69 |
+
|
| 70 |
+
async def run_ingestion(
|
| 71 |
+
self,
|
| 72 |
+
source: str = "all",
|
| 73 |
+
category: str = None,
|
| 74 |
+
) -> list[ResearchSignal]:
|
| 75 |
+
"""Run a full ingestion cycle.
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
source: 'arxiv', 'github', or 'all'
|
| 79 |
+
category: Optional arXiv category filter
|
| 80 |
+
|
| 81 |
+
Returns:
|
| 82 |
+
List of new research signals ingested
|
| 83 |
+
"""
|
| 84 |
+
self._status = "ingesting"
|
| 85 |
+
all_signals = []
|
| 86 |
+
|
| 87 |
+
# Fetch from arXiv
|
| 88 |
+
if source in ("all", "arxiv"):
|
| 89 |
+
try:
|
| 90 |
+
arxiv_signals = await self.arxiv_crawler.fetch_recent_papers(
|
| 91 |
+
category=category
|
| 92 |
+
)
|
| 93 |
+
all_signals.extend(arxiv_signals)
|
| 94 |
+
self._source_health["arxiv"]["status"] = "healthy"
|
| 95 |
+
self._source_health["arxiv"]["last_success"] = (
|
| 96 |
+
datetime.utcnow().isoformat()
|
| 97 |
+
)
|
| 98 |
+
self._source_health["arxiv"]["failures"] = 0
|
| 99 |
+
except Exception as e:
|
| 100 |
+
logger.error(f"arXiv ingestion failed: {e}")
|
| 101 |
+
self._source_health["arxiv"]["failures"] += 1
|
| 102 |
+
if self._source_health["arxiv"]["failures"] >= 3:
|
| 103 |
+
self._source_health["arxiv"]["status"] = "unhealthy"
|
| 104 |
+
|
| 105 |
+
# Fetch from GitHub
|
| 106 |
+
if source in ("all", "github"):
|
| 107 |
+
try:
|
| 108 |
+
for topic in ["machine-learning", "deep-learning", "transformers"]:
|
| 109 |
+
github_signals = await self.github_crawler.fetch_trending_repos(
|
| 110 |
+
topic=topic
|
| 111 |
+
)
|
| 112 |
+
all_signals.extend(github_signals)
|
| 113 |
+
self._source_health["github"]["status"] = "healthy"
|
| 114 |
+
self._source_health["github"]["last_success"] = (
|
| 115 |
+
datetime.utcnow().isoformat()
|
| 116 |
+
)
|
| 117 |
+
self._source_health["github"]["failures"] = 0
|
| 118 |
+
except Exception as e:
|
| 119 |
+
logger.error(f"GitHub ingestion failed: {e}")
|
| 120 |
+
self._source_health["github"]["failures"] += 1
|
| 121 |
+
if self._source_health["github"]["failures"] >= 3:
|
| 122 |
+
self._source_health["github"]["status"] = "unhealthy"
|
| 123 |
+
# Fetch from Patents
|
| 124 |
+
if source in ("all", "patents"):
|
| 125 |
+
try:
|
| 126 |
+
patent_signals = []
|
| 127 |
+
if config.ENABLE_PATENTS_REAL:
|
| 128 |
+
patent_signals = await self.patent_crawler.fetch_recent_patents(
|
| 129 |
+
max_results=config.PATENTS_MAX_RESULTS
|
| 130 |
+
)
|
| 131 |
+
if not patent_signals and config.ALLOW_SIMULATED_SOURCES:
|
| 132 |
+
patent_signals = [
|
| 133 |
+
ResearchSignal(
|
| 134 |
+
source=SignalSource.PATENT,
|
| 135 |
+
source_id="US-2026-0012345",
|
| 136 |
+
title="Distributed Multi-Agent Reasoning via Sparse Attention Meshes",
|
| 137 |
+
authors=["VectorMind R&D"],
|
| 138 |
+
raw_text="A method and system for optimizing multi-agent reasoning in decentralized networks...",
|
| 139 |
+
url="https://patents.google.com/patent/US20260012345A1",
|
| 140 |
+
metadata={
|
| 141 |
+
"patent_number": "US20260012345",
|
| 142 |
+
"assignee": "Samsung R&D",
|
| 143 |
+
"simulated": True,
|
| 144 |
+
},
|
| 145 |
+
)
|
| 146 |
+
]
|
| 147 |
+
all_signals.extend(patent_signals)
|
| 148 |
+
self._source_health["patents"]["status"] = "healthy"
|
| 149 |
+
self._source_health["patents"]["last_success"] = datetime.utcnow().isoformat()
|
| 150 |
+
except Exception as e:
|
| 151 |
+
logger.error(f"Patent ingestion failed: {e}")
|
| 152 |
+
self._source_health["patents"]["status"] = "unhealthy"
|
| 153 |
+
if config.ALLOW_SIMULATED_SOURCES:
|
| 154 |
+
all_signals.extend(self._simulated_patent_signals())
|
| 155 |
+
|
| 156 |
+
# Fetch from Startup Ecosystem
|
| 157 |
+
if source in ("all", "startups"):
|
| 158 |
+
try:
|
| 159 |
+
startup_signals = []
|
| 160 |
+
if config.ENABLE_STARTUPS_REAL:
|
| 161 |
+
startup_signals = await self.startup_crawler.fetch_startup_signals(
|
| 162 |
+
max_results=config.STARTUPS_MAX_RESULTS
|
| 163 |
+
)
|
| 164 |
+
if not startup_signals and config.ALLOW_SIMULATED_SOURCES:
|
| 165 |
+
startup_signals = [
|
| 166 |
+
ResearchSignal(
|
| 167 |
+
source=SignalSource.STARTUP,
|
| 168 |
+
source_id="YC-W26-VECT",
|
| 169 |
+
title="Seed Funding: NeuroForge AI (YC W26)",
|
| 170 |
+
authors=["YC"],
|
| 171 |
+
raw_text="NeuroForge AI raises $5M to commercialize sparse attention architectures.",
|
| 172 |
+
url="https://ycombinator.com/companies/neuroforge",
|
| 173 |
+
metadata={"funding_round": "Seed", "amount": "$5M", "simulated": True},
|
| 174 |
+
)
|
| 175 |
+
]
|
| 176 |
+
all_signals.extend(startup_signals)
|
| 177 |
+
self._source_health["startups"]["status"] = "healthy"
|
| 178 |
+
self._source_health["startups"]["last_success"] = datetime.utcnow().isoformat()
|
| 179 |
+
except Exception:
|
| 180 |
+
self._source_health["startups"]["status"] = "unhealthy"
|
| 181 |
+
if config.ALLOW_SIMULATED_SOURCES:
|
| 182 |
+
all_signals.extend(self._simulated_startup_signals())
|
| 183 |
+
|
| 184 |
+
# Fetch from Social (Hacker News)
|
| 185 |
+
if source in ("all", "social"):
|
| 186 |
+
try:
|
| 187 |
+
social_signals = []
|
| 188 |
+
if config.ENABLE_SOCIAL_REAL:
|
| 189 |
+
social_signals = await self.social_crawler.fetch_hn_signals(
|
| 190 |
+
max_results=config.SOCIAL_MAX_RESULTS
|
| 191 |
+
)
|
| 192 |
+
if not social_signals and config.ALLOW_SIMULATED_SOURCES:
|
| 193 |
+
social_signals = [
|
| 194 |
+
ResearchSignal(
|
| 195 |
+
source=SignalSource.SOCIAL,
|
| 196 |
+
source_id="HN-4123456",
|
| 197 |
+
title="Show HN: VectorMind - Open Source Research Intelligence",
|
| 198 |
+
authors=["hn_user"],
|
| 199 |
+
raw_text="The first agentic platform for autonomous research...",
|
| 200 |
+
url="https://news.ycombinator.com/item?id=4123456",
|
| 201 |
+
metadata={"upvotes": 450, "comments": 82, "simulated": True},
|
| 202 |
+
)
|
| 203 |
+
]
|
| 204 |
+
all_signals.extend(social_signals)
|
| 205 |
+
self._source_health["social"]["status"] = "healthy"
|
| 206 |
+
self._source_health["social"]["last_success"] = datetime.utcnow().isoformat()
|
| 207 |
+
except Exception:
|
| 208 |
+
self._source_health["social"]["status"] = "unhealthy"
|
| 209 |
+
if config.ALLOW_SIMULATED_SOURCES:
|
| 210 |
+
all_signals.extend(self._simulated_social_signals())
|
| 211 |
+
|
| 212 |
+
# Fetch from Blogs (labs + ecosystem)
|
| 213 |
+
if source in ("all", "blog"):
|
| 214 |
+
try:
|
| 215 |
+
blog_signals = []
|
| 216 |
+
if config.ENABLE_BLOG_REAL:
|
| 217 |
+
blog_signals = await self.blog_crawler.fetch_blog_signals(
|
| 218 |
+
max_results=config.BLOG_MAX_RESULTS
|
| 219 |
+
)
|
| 220 |
+
all_signals.extend(blog_signals)
|
| 221 |
+
self._source_health["blog"]["status"] = "healthy"
|
| 222 |
+
self._source_health["blog"]["last_success"] = datetime.utcnow().isoformat()
|
| 223 |
+
except Exception:
|
| 224 |
+
self._source_health["blog"]["status"] = "unhealthy"
|
| 225 |
+
|
| 226 |
+
# Embed all signals
|
| 227 |
+
if all_signals:
|
| 228 |
+
texts = [
|
| 229 |
+
f"{s.title}. {s.raw_text}" for s in all_signals
|
| 230 |
+
]
|
| 231 |
+
embeddings = self.embedding_engine.embed_batch(texts)
|
| 232 |
+
|
| 233 |
+
for signal, embedding in zip(all_signals, embeddings):
|
| 234 |
+
signal.embedding = embedding
|
| 235 |
+
|
| 236 |
+
# Compute novelty score
|
| 237 |
+
signal.novelty_score = self.vector_store.compute_novelty_score(
|
| 238 |
+
embedding
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
# Store in vector store
|
| 242 |
+
self.vector_store.upsert_signal(
|
| 243 |
+
signal_id=signal.id,
|
| 244 |
+
embedding=embedding,
|
| 245 |
+
payload={
|
| 246 |
+
"id": signal.id,
|
| 247 |
+
"source": signal.source.value,
|
| 248 |
+
"source_id": signal.source_id,
|
| 249 |
+
"title": signal.title,
|
| 250 |
+
"raw_text": signal.raw_text[:500],
|
| 251 |
+
"authors": signal.authors[:5],
|
| 252 |
+
"categories": signal.categories,
|
| 253 |
+
"url": signal.url,
|
| 254 |
+
"novelty_score": signal.novelty_score,
|
| 255 |
+
"timestamp": signal.timestamp.isoformat(),
|
| 256 |
+
"metadata": signal.metadata,
|
| 257 |
+
},
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
# Publish event for each new signal
|
| 261 |
+
await self.publish(
|
| 262 |
+
"ingestion.new_signal",
|
| 263 |
+
{
|
| 264 |
+
"signal_id": signal.id,
|
| 265 |
+
"source": signal.source.value,
|
| 266 |
+
"title": signal.title,
|
| 267 |
+
"novelty_score": signal.novelty_score,
|
| 268 |
+
},
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
self._ingestion_count += len(all_signals)
|
| 272 |
+
self._last_ingestion = datetime.utcnow()
|
| 273 |
+
self._status = "running"
|
| 274 |
+
|
| 275 |
+
logger.info(
|
| 276 |
+
f"Ingestion complete: {len(all_signals)} signals "
|
| 277 |
+
f"(total: {self._ingestion_count})"
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
return all_signals
|
| 281 |
+
|
| 282 |
+
def get_health(self) -> dict:
    """Return the base health report extended with ingestion metrics.

    Returns:
        The dict produced by ``super().get_health()`` with three extra keys:
        ``source_health`` (the per-source status mapping), ``total_ingested``
        (running count of ingested signals) and ``last_ingestion`` (ISO-8601
        string of the last completed run, or ``None`` when
        ``_last_ingestion`` is unset).
    """
    report = super().get_health()
    last_run = self._last_ingestion
    report["source_health"] = self._source_health
    report["total_ingested"] = self._ingestion_count
    # isoformat() is only called when a run timestamp has been recorded.
    report["last_ingestion"] = last_run.isoformat() if last_run else None
    return report
|
| 292 |
+
|
| 293 |
+
def _simulated_patent_signals(self) -> list[ResearchSignal]:
    """Build the hard-coded placeholder patent signal.

    Returns:
        A one-element list holding a ``ResearchSignal`` with source
        ``SignalSource.PATENT`` whose metadata carries ``"simulated": True``.
    """
    patent_metadata = {
        "patent_number": "US20260012345",
        "assignee": "Samsung R&D",
        "simulated": True,
    }
    demo_patent = ResearchSignal(
        source=SignalSource.PATENT,
        source_id="US-2026-0012345",
        title="Distributed Multi-Agent Reasoning via Sparse Attention Meshes",
        authors=["VectorMind R&D"],
        raw_text="A method and system for optimizing multi-agent reasoning in decentralized networks...",
        url="https://patents.google.com/patent/US20260012345A1",
        metadata=patent_metadata,
    )
    return [demo_patent]
|
| 309 |
+
|
| 310 |
+
def _simulated_startup_signals(self) -> list[ResearchSignal]:
    """Build the hard-coded placeholder startup-funding signal.

    Returns:
        A one-element list holding a ``ResearchSignal`` with source
        ``SignalSource.STARTUP`` whose metadata carries ``"simulated": True``.
    """
    funding_metadata = {"funding_round": "Seed", "amount": "$5M", "simulated": True}
    demo_startup = ResearchSignal(
        source=SignalSource.STARTUP,
        source_id="YC-W26-VECT",
        title="Seed Funding: NeuroForge AI (YC W26)",
        authors=["YC"],
        raw_text="NeuroForge AI raises $5M to commercialize sparse attention architectures.",
        url="https://ycombinator.com/companies/neuroforge",
        metadata=funding_metadata,
    )
    return [demo_startup]
|
| 322 |
+
|
| 323 |
+
def _simulated_social_signals(self) -> list[ResearchSignal]:
    """Build the hard-coded placeholder social (Hacker News) signal.

    Returns:
        A one-element list holding a ``ResearchSignal`` with source
        ``SignalSource.SOCIAL`` whose metadata carries ``"simulated": True``.
    """
    engagement_metadata = {"upvotes": 450, "comments": 82, "simulated": True}
    demo_post = ResearchSignal(
        source=SignalSource.SOCIAL,
        source_id="HN-4123456",
        title="Show HN: VectorMind - Open Source Research Intelligence",
        authors=["hn_user"],
        raw_text="The first agentic platform for autonomous research...",
        url="https://news.ycombinator.com/item?id=4123456",
        metadata=engagement_metadata,
    )
    return [demo_post]
|
agents/memory_agent.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Memory Agent — Long-horizon context and personalization.
|
| 2 |
+
|
| 3 |
+
Stores user interaction history, manages feedback signals,
|
| 4 |
+
and provides personalized scoring adjustments.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from collections import defaultdict
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
from agents.base_agent import BaseAgent
|
| 15 |
+
from ingestion.schema import AgentEvent, UserFeedback
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger("vectorminds.memory_agent")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class MemoryAgent(BaseAgent):
    """Agent that maintains persistent context and user preferences.

    All state lives in plain in-process containers on the instance: scored
    signals, feedback, cached blueprints and stored pipelines.
    """

    def __init__(self):
        super().__init__("MemoryAgent")

        # Working memory (replaces Redis): latest scored payload per signal id.
        self.working_memory: dict[str, dict] = {}

        # Episodic memory — append-only interaction history.
        self.interaction_history: list[dict] = []

        # User preferences (learned from feedback): category -> running score.
        self.user_preferences: dict[str, float] = defaultdict(float)

        # Feedback store.
        self.feedback_log: list[UserFeedback] = []

        # Blueprint cache, keyed by blueprint id.
        self.blueprint_cache: dict[str, dict] = {}

        # Pipeline portfolio.
        self.pipeline_portfolio: list[dict] = []

    @staticmethod
    def _normalize_categories(categories) -> list:
        """Coerce a ``categories`` value into a list of category names.

        ``None`` (or any other falsy value) becomes an empty list, and a bare
        string is treated as a single category instead of being iterated
        character by character.
        """
        if not categories:
            return []
        if isinstance(categories, str):
            return [categories]
        return list(categories)

    def setup(self):
        """Subscribe to the scored-signal and feedback topics."""
        self.subscribe("reasoning.scored")
        self.subscribe("delivery.feedback")

    async def process_event(self, event: AgentEvent):
        """Process scored signals and feedback events.

        ``reasoning.scored`` payloads are appended to the interaction history
        and, when they carry a ``signal_id``, stored in working memory under
        that id. ``delivery.feedback`` payloads are handed to
        ``_process_feedback``. Other topics are ignored.
        """
        if event.topic == "reasoning.scored":
            # Store in episodic memory
            self.interaction_history.append({
                "type": "signal_scored",
                "timestamp": datetime.utcnow().isoformat(),
                "data": event.payload,
            })
            # Update working memory
            signal_id = event.payload.get("signal_id", "")
            if signal_id:
                self.working_memory[signal_id] = event.payload

        elif event.topic == "delivery.feedback":
            await self._process_feedback(event.payload)

    async def _process_feedback(self, payload: dict):
        """Process user feedback to update preferences.

        Args:
            payload: Feedback event payload. Reads ``target_id``,
                ``target_type``, ``action`` and ``categories``; a missing or
                ``None`` ``categories`` value is treated as "no categories".
        """
        feedback = UserFeedback(
            target_id=payload.get("target_id", ""),
            target_type=payload.get("target_type", "trend"),
            action=payload.get("action", "upvote"),
        )
        self.feedback_log.append(feedback)

        # Update user preferences based on feedback.
        # NOTE(review): every action other than "upvote" is penalised, not
        # only "downvote" — confirm this is intended for other action kinds.
        weight = 1.0 if feedback.action == "upvote" else -0.5
        for category in self._normalize_categories(payload.get("categories")):
            self.user_preferences[category] += weight

        logger.info(
            "Feedback recorded: %s on %s/%s",
            feedback.action,
            feedback.target_type,
            feedback.target_id,
        )

    def get_preference_weight(self, categories: list[str]) -> float:
        """Get a personalization weight based on user preferences.

        Args:
            categories: List of categories for a signal/trend. ``None`` or an
                empty list yields the neutral weight.

        Returns:
            Weight multiplier (>1 = preferred, <1 = less preferred), clamped
            to the range [0.5, 1.5]. Returns 1.0 when no preferences have
            been learned yet.
        """
        if not self.user_preferences:
            return 1.0

        # .get() is used so unknown categories do not create defaultdict entries.
        scores = [
            self.user_preferences.get(c, 0)
            for c in self._normalize_categories(categories)
        ]
        if not scores:
            return 1.0

        avg_pref = sum(scores) / len(scores)
        # Normalize to a multiplier around 1.0
        return max(0.5, min(1.5, 1.0 + avg_pref * 0.1))

    def store_blueprint(self, blueprint_id: str, data: dict):
        """Cache a generated blueprint, stamping it with ``stored_at``."""
        self.blueprint_cache[blueprint_id] = {
            **data,
            "stored_at": datetime.utcnow().isoformat(),
        }

    def get_blueprint(self, blueprint_id: str) -> Optional[dict]:
        """Retrieve a cached blueprint, or ``None`` if the id is unknown."""
        return self.blueprint_cache.get(blueprint_id)

    def store_pipeline(self, pipeline_data: dict):
        """Add a pipeline to the portfolio, stamping it with ``stored_at``."""
        self.pipeline_portfolio.append({
            **pipeline_data,
            "stored_at": datetime.utcnow().isoformat(),
        })

    def get_stats(self) -> dict:
        """Get memory agent statistics (container sizes and vote counts)."""
        # Count both vote kinds in a single pass over the feedback log.
        upvotes = 0
        downvotes = 0
        for entry in self.feedback_log:
            if entry.action == "upvote":
                upvotes += 1
            elif entry.action == "downvote":
                downvotes += 1

        return {
            "working_memory_size": len(self.working_memory),
            "interaction_count": len(self.interaction_history),
            "feedback_count": len(self.feedback_log),
            "blueprints_cached": len(self.blueprint_cache),
            "pipelines_stored": len(self.pipeline_portfolio),
            "preference_categories": len(self.user_preferences),
            "upvotes": upvotes,
            "downvotes": downvotes,
        }

    def get_health(self) -> dict:
        """Return the base health report merged with ``get_stats()``."""
        health = super().get_health()
        health.update(self.get_stats())
        return health
|
agents/message_bus.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Async Message Bus — Inter-agent communication layer.
|
| 2 |
+
|
| 3 |
+
Replaces Apache Kafka with asyncio queues for the hackathon MVP.
|
| 4 |
+
Typed event system with pub/sub pattern.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import logging
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from typing import Callable, Optional
|
| 14 |
+
|
| 15 |
+
import config
|
| 16 |
+
from ingestion.schema import AgentEvent
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger("vectorminds.messagebus")
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
from aiokafka import AIOKafkaProducer
|
| 22 |
+
except Exception: # pragma: no cover - optional dependency
|
| 23 |
+
AIOKafkaProducer = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class MessageBus:
    """In-memory async message bus for agent communication.

    Events are delivered to per-topic ``asyncio.Queue`` subscribers and to
    registered async handlers. When ``config.MESSAGE_BUS_BACKEND`` is
    ``"kafka_mirror"`` and ``aiokafka`` could be imported, events are also
    mirrored to Kafka once ``start()`` has been awaited.
    """

    _instance: Optional["MessageBus"] = None

    @classmethod
    def get_instance(cls) -> "MessageBus":
        """Return the shared bus instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self._subscribers: dict[str, list[asyncio.Queue]] = defaultdict(list)
        self._handlers: dict[str, list[Callable]] = defaultdict(list)
        # Every published event is retained here for get_recent_events().
        self._event_log: list[AgentEvent] = []
        self._running = False
        self._producer = None
        self._mirror_enabled = (
            config.MESSAGE_BUS_BACKEND == "kafka_mirror" and AIOKafkaProducer is not None
        )

    async def start(self):
        """Start optional Kafka mirror producer.

        The producer is stored on the bus only after it has started
        successfully. If ``start()`` raises, the exception propagates and the
        bus stays in pure in-memory mode, so a later ``start()`` can retry.
        """
        if not self._mirror_enabled or self._producer is not None:
            return

        producer = AIOKafkaProducer(
            bootstrap_servers=config.KAFKA_BOOTSTRAP_SERVERS,
        )
        await producer.start()
        self._producer = producer
        logger.info("Message bus Kafka mirror producer started")

    async def stop(self):
        """Stop optional Kafka producer.

        The reference is cleared before awaiting so ``publish`` stops
        mirroring immediately and the bus is reset even if ``stop()`` raises.
        """
        producer, self._producer = self._producer, None
        if producer is not None:
            await producer.stop()

    def subscribe(self, topic: str) -> asyncio.Queue:
        """Subscribe to a topic and get a queue for receiving events.

        Args:
            topic: Event topic (e.g. 'ingestion.new_signal')

        Returns:
            asyncio.Queue that will receive events for this topic
        """
        queue = asyncio.Queue()
        self._subscribers[topic].append(queue)
        logger.debug(f"New subscriber for topic '{topic}'")
        return queue

    def register_handler(self, topic: str, handler: Callable):
        """Register a handler function for a topic.

        Args:
            topic: Event topic
            handler: Async callable that processes AgentEvent
        """
        self._handlers[topic].append(handler)
        logger.debug(f"Handler registered for topic '{topic}'")

    async def publish(self, event: AgentEvent):
        """Publish an event to all subscribers of its topic.

        Handler and Kafka-mirror failures are logged (with traceback) and do
        not stop delivery to the remaining recipients.

        Args:
            event: The event to publish
        """
        self._event_log.append(event)

        # Send to queue subscribers
        for queue in self._subscribers.get(event.topic, []):
            await queue.put(event)

        # Call registered handlers
        for handler in self._handlers.get(event.topic, []):
            try:
                await handler(event)
            except Exception as e:
                logger.exception(f"Handler error for topic '{event.topic}': {e}")

        # Mirror events to Kafka if enabled.
        if self._producer is not None:
            kafka_topic = (
                f"{config.KAFKA_TOPIC_PREFIX}."
                f"{event.topic.replace('.', '_')}"
            )
            try:
                await self._producer.send_and_wait(
                    kafka_topic,
                    event.model_dump_json().encode("utf-8"),
                )
            except Exception as e:
                logger.exception(f"Kafka mirror publish error for '{kafka_topic}': {e}")

        logger.debug(
            f"Published event: topic='{event.topic}', "
            f"source='{event.source_agent}'"
        )

    async def publish_simple(self, topic: str, source: str, payload: dict):
        """Convenience method to publish an event with minimal boilerplate."""
        event = AgentEvent(
            topic=topic,
            source_agent=source,
            timestamp=datetime.utcnow(),
            payload=payload,
        )
        await self.publish(event)

    def get_recent_events(self, topic: Optional[str] = None, limit: int = 50) -> list[dict]:
        """Get recent events, optionally filtered by topic.

        Args:
            topic: When given (and non-empty), only events with this topic
                are returned.
            limit: Maximum number of events to return; zero or a negative
                value returns an empty list.

        Returns:
            Up to ``limit`` most recent events as dicts, oldest first.
        """
        # events[-0:] would be the whole list, so non-positive limits are
        # handled explicitly.
        if limit <= 0:
            return []

        events = self._event_log
        if topic:
            events = [e for e in events if e.topic == topic]
        return [e.model_dump() for e in events[-limit:]]

    def clear(self):
        """Clear all subscriptions and event log."""
        self._subscribers.clear()
        self._handlers.clear()
        self._event_log.clear()
|
agents/reasoning_agent.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reasoning Agent — Core intelligence agent of VectorMinds.
|
| 2 |
+
|
| 3 |
+
Performs cross-source correlation, novelty scoring, impact prediction,
|
| 4 |
+
and chain-of-thought summarization using LLM.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
import httpx
|
| 15 |
+
|
| 16 |
+
from agents.base_agent import BaseAgent
|
| 17 |
+
from embeddings.vector_store import VectorStore
|
| 18 |
+
from ingestion.schema import AgentEvent, TrendEntry
|
| 19 |
+
import config
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger("vectorminds.reasoning_agent")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ─── Mock LLM Responses (for demo when no API key) ───────────
# Canned technique briefs. Each entry is a dict with the keys:
#   "technique"     — display name of the technique
#   "brief"         — multi-sentence technical summary
#   "impact"        — one-paragraph impact statement
#   "competes_with" — list of competing/related approaches
# NOTE(review): presumably served by ReasoningAgent._get_mock_brief — confirm
# against that helper, which is outside this view.
MOCK_BRIEFS = [
    {
        "technique": "State Space Models (Mamba)",
        "brief": "A new class of sequence models that achieve transformer-quality results with linear-time complexity. Mamba introduces selective state spaces that can dynamically filter information, enabling 5x faster inference and the ability to process sequences of unlimited length. Key insight: by making the state space parameters input-dependent, the model can selectively remember or forget information.",
        "impact": "Could fundamentally reshape the efficiency of language models, making GPT-4-class models runnable on consumer hardware.",
        "competes_with": ["Transformers", "RWKV", "Hyena"],
    },
    {
        "technique": "Mixture of Experts (MoE) Scaling",
        "brief": "Sparse MoE architectures activate only a fraction of parameters per token, enabling models with trillions of parameters to run at the cost of much smaller dense models. Recent innovations in expert routing and load balancing have made MoE practical for production deployment.",
        "impact": "Enables 10x parameter scaling without proportional compute increase. Key enabler for next-generation foundation models.",
        "competes_with": ["Dense Transformers", "Distillation", "Pruning"],
    },
    {
        "technique": "Retrieval-Augmented Generation (RAG) 2.0",
        "brief": "Next-generation RAG systems move beyond simple vector similarity search. They incorporate graph-based retrieval, multi-hop reasoning chains, and learned retrieval strategies that adapt to the query complexity. Self-RAG and Corrective RAG add self-reflection loops that verify retrieved context relevance.",
        "impact": "Dramatically reduces hallucination in production LLM applications. Enables enterprise AI deployment with verifiable sources.",
        "competes_with": ["Fine-tuning", "In-context Learning", "Knowledge Graphs"],
    },
    {
        "technique": "Diffusion Transformers (DiT)",
        "brief": "Replacing the U-Net backbone in diffusion models with transformer architectures. DiT achieves state-of-the-art image generation quality while being more scalable and amenable to the same scaling laws that power language models.",
        "impact": "Unifies the image and language generation paradigms under a single architecture family, enabling multimodal foundation models.",
        "competes_with": ["U-Net Diffusion", "GANs", "Autoregressive Image Models"],
    },
    {
        "technique": "Constitutional AI & RLHF Alternatives",
        "brief": "New alignment approaches that reduce dependence on expensive human feedback. DPO (Direct Preference Optimization) eliminates the reward model entirely. Constitutional AI uses AI-generated critiques to self-improve. KTO (Kahneman-Tversky Optimization) requires only binary good/bad labels.",
        "impact": "Democratizes model alignment — any team can align models without a large annotation workforce.",
        "competes_with": ["RLHF", "PPO", "Manual Prompt Engineering"],
    },
    {
        "technique": "Multi-Agent LLM Systems",
        "brief": "Frameworks where multiple LLM instances collaborate as specialized agents to solve complex tasks. Each agent has defined roles, tools, and memory. Key innovations: hierarchical planning agents, critic/validator agents, and shared workspace protocols.",
        "impact": "Moves AI from single-turn Q&A to autonomous multi-step problem solving. Foundation for AI software engineers and research assistants.",
        "competes_with": ["Single-agent CoT", "Function Calling", "Manual Workflows"],
    },
    {
        "technique": "LoRA & Parameter-Efficient Fine-Tuning",
        "brief": "Low-Rank Adaptation enables fine-tuning large models by training only small adapter matrices. QLoRA extends this with 4-bit quantization, enabling fine-tuning of 65B models on a single GPU. DoRA and LoRA+ improve convergence and final quality.",
        "impact": "Every organization can now customize foundation models for their domain at minimal cost. Enables private, specialized AI.",
        "competes_with": ["Full Fine-tuning", "Prompt Tuning", "In-context Learning"],
    },
    {
        "technique": "Vision-Language Models (VLMs)",
        "brief": "Models that natively understand both images and text, enabling visual question answering, image-grounded dialogue, and document understanding. LLaVA, Qwen-VL, and GPT-4V demonstrate that vision encoders can be efficiently fused with language models.",
        "impact": "Unlocks multimodal AI applications: automated document processing, visual inspection, accessibility tools, and creative design assistants.",
        "competes_with": ["OCR + LLM Pipelines", "CLIP", "Specialized Vision Models"],
    },
    {
        "technique": "Structured Output Generation",
        "brief": "Constrained decoding techniques that guarantee LLM outputs conform to a specified schema (JSON, SQL, code). Grammar-guided generation and token masking ensure 100% format compliance without post-processing.",
        "impact": "Makes LLMs reliable for production software integration. Eliminates the 'parsing problem' that plagues LLM-powered applications.",
        "competes_with": ["Regex Post-processing", "Retry Loops", "Fine-tuning for Format"],
    },
    {
        "technique": "Speculative Decoding & Inference Optimization",
        "brief": "Techniques that accelerate LLM inference without quality loss. Speculative decoding uses a small draft model to propose tokens that a large model verifies in parallel. Combined with KV-cache optimization, flash attention, and quantization, these achieve 3-5x speedup.",
        "impact": "Makes large model deployment economically viable. Reduces per-token cost to enable real-time conversational AI at scale.",
        "competes_with": ["Model Distillation", "Pruning", "Smaller Models"],
    },
]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class ReasoningAgent(BaseAgent):
|
| 90 |
+
"""Agent that performs cross-source analysis, scoring, and summarization."""
|
| 91 |
+
|
| 92 |
+
def __init__(self):
    """Register as ``"ReasoningAgent"`` and initialise analysis state."""
    super().__init__("ReasoningAgent")
    # Store handle obtained through VectorStore.get_instance().
    self.vector_store = VectorStore.get_instance()
    # Cursor starts at 0; the code that advances it is outside this block.
    self._mock_brief_idx = 0
    # Latest ranking, keyed by TrendEntry.id; replaced by analyze_trends().
    self.trends: dict[str, TrendEntry] = {}
|
| 97 |
+
|
| 98 |
+
def setup(self):
    """Subscribe this agent to the ``ingestion.new_signal`` topic."""
    self.subscribe("ingestion.new_signal")
|
| 100 |
+
|
| 101 |
+
async def process_event(self, event: AgentEvent):
    """Handle a bus event; only ``ingestion.new_signal`` is acted on.

    The signal title is written to the debug log. No scoring is done here:
    impact scoring happens in batch via analyze_trends().
    """
    if event.topic != "ingestion.new_signal":
        return

    incoming = event.payload
    signal_title = incoming.get("title", "unknown")
    logger.debug(f"Processing signal: {signal_title}")
|
| 109 |
+
|
| 110 |
+
async def analyze_trends(self) -> list[TrendEntry]:
    """Analyze all stored signals and produce a ranked trend leaderboard.

    This is the core intelligence function that:
    1. Clusters related signals
    2. Scores each cluster for emergence and impact
    3. Generates technical briefs
    4. Returns ranked trends

    ``self._status`` is ``"analyzing"`` while this runs and is always reset
    to ``"running"`` afterwards, including when a step raises.

    Returns:
        The top 20 TrendEntry objects sorted by descending emergence score
        (empty when the vector store holds no payloads). The full ranking is
        stored on ``self.trends`` keyed by trend id.
    """
    self._status = "analyzing"
    logger.info("Running trend analysis...")

    try:
        payloads = self.vector_store.get_all_payloads(limit=500)
        if not payloads:
            return []

        # Group signals by the technique key derived from title/categories
        # (simplified clustering).
        technique_clusters: dict[str, list[dict]] = {}
        for p in payloads:
            key = self._extract_technique_key(
                p.get("title", ""), p.get("categories", [])
            )
            technique_clusters.setdefault(key, []).append(p)

        # Score each cluster
        trends = []
        for idx, (technique, signals) in enumerate(technique_clusters.items()):
            # A cluster is only created when its first signal is added, so
            # `signals` is never empty here. Missing/None scores count as 0.
            novelty_scores = [s.get("novelty_score") or 0 for s in signals]
            avg_novelty = sum(novelty_scores) / len(novelty_scores)

            # Compute emergence score
            github_signals = [s for s in signals if s.get("source") == "github"]
            github_stars = sum(
                (s.get("metadata") or {}).get("stars") or 0
                for s in github_signals
            )
            paper_count = sum(1 for s in signals if s.get("source") == "arxiv")

            emergence_score = self._compute_emergence_score(
                avg_novelty, len(signals), github_stars, paper_count
            )

            # Impact prediction (simplified heuristic for MVP)
            impact_score = self._predict_impact(
                avg_novelty, len(signals), github_stars
            )

            # Get or generate technical brief
            brief_data = self._get_mock_brief(idx)

            trend = TrendEntry(
                rank=0,  # Will be set after sorting
                technique_name=brief_data["technique"] if config.USE_MOCK_LLM else technique,
                description=brief_data["brief"],
                emergence_score=round(emergence_score, 3),
                novelty_score=round(avg_novelty, 3),
                impact_score=round(impact_score, 3),
                mainstream_eta_months=self._estimate_eta(impact_score),
                confidence=round(min(0.95, 0.5 + impact_score * 0.4), 2),
                source_signals={
                    "arxiv_papers": paper_count,
                    # Count only GitHub-sourced signals; other sources
                    # (blog, patent, startup, social) are not repos.
                    "github_repos": len(github_signals),
                    "total_github_stars": github_stars,
                },
                competitive_landscape=brief_data.get("competes_with", []),
                risk_factors=self._assess_risks(signals),
                related_techniques=brief_data.get("competes_with", [])[:3],
                paper_count=paper_count,
                github_stars=github_stars,
                signal_ids=[s.get("id", "") for s in signals],
            )
            trends.append(trend)

        # Sort by emergence score and assign ranks
        trends.sort(key=lambda t: t.emergence_score, reverse=True)
        for position, trend in enumerate(trends, start=1):
            trend.rank = position

        # Store trends
        self.trends = {t.id: t for t in trends}

        logger.info(f"Trend analysis complete: {len(trends)} techniques ranked")
        return trends[:20]  # Return top 20
    finally:
        # Never leave the agent stuck in "analyzing" after a failure.
        self._status = "running"
|
| 203 |
+
|
| 204 |
+
async def generate_technical_brief(
    self, technique: str, context: str = ""
) -> str:
    """Generate a detailed technical brief using LLM or mock.

    The mock brief is returned when ``config.USE_MOCK_LLM`` is set, when no
    ``config.LLM_API_KEY`` is configured, or when the LLM call fails for any
    reason (the failure is logged).

    Args:
        technique: Technique name
        context: Additional context (paper abstracts, etc.); only the first
            2000 characters are sent. ``None`` is treated as empty.

    Returns:
        Formatted technical brief text
    """
    if config.USE_MOCK_LLM or not config.LLM_API_KEY:
        return self._generate_mock_brief(technique)

    prompt = (
        "You are a senior AI research analyst at VectorMinds. "
        "Generate a structured technical brief for an emerging AI technique. "
        "Include: core technique description, key insight, why it matters, "
        "what it enables, what it competes with, and 12-month impact prediction.\n\n"
        f"Technique: {technique}\n\n"
        f"Context from recent papers:\n{(context or '')[:2000]}"
    )

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(
                f"{config.GEMINI_BASE_URL}/models/{config.LLM_MODEL}:generateContent",
                # The key travels in a header rather than the query string:
                # HTTP error messages include the request URL, so a key in
                # the URL would end up in the error log below.
                headers={
                    "Content-Type": "application/json",
                    "x-goog-api-key": config.LLM_API_KEY,
                },
                json={
                    "contents": [{"parts": [{"text": prompt}]}],
                    "generationConfig": {
                        "temperature": 0.7,
                        "maxOutputTokens": 4096,
                    },
                },
            )
            resp.raise_for_status()
            data = resp.json()

        candidates = data.get("candidates", [])
        if not candidates:
            raise ValueError("No Gemini candidates returned")
        parts = candidates[0].get("content", {}).get("parts", [])
        text = "".join(p.get("text", "") for p in parts if isinstance(p, dict)).strip()
        if not text:
            raise ValueError("Empty Gemini response text")
        return text
    except Exception as e:
        # Deliberate best-effort boundary: any failure falls back to the mock.
        logger.error(f"LLM brief generation failed: {e}")
        return self._generate_mock_brief(technique)
|
| 254 |
+
|
| 255 |
+
def _extract_technique_key(self, title: str, categories: list) -> str:
|
| 256 |
+
"""Extract a technique cluster key from title and categories.
|
| 257 |
+
|
| 258 |
+
Priority:
|
| 259 |
+
1. Match a curated keyword list against the title.
|
| 260 |
+
2. Match against descriptive category tokens (skipping arXiv class codes
|
| 261 |
+
like ``cs.LG`` and source labels like ``arxiv``/``blog``).
|
| 262 |
+
3. Fallback to ``General AI`` so we never surface an internal source label
|
| 263 |
+
as a "technique".
|
| 264 |
+
"""
|
| 265 |
+
title_lower = (title or "").lower()
|
| 266 |
+
|
| 267 |
+
keywords = [
|
| 268 |
+
("transformer", "Transformer Architectures"),
|
| 269 |
+
("attention", "Attention Mechanisms"),
|
| 270 |
+
("diffusion", "Diffusion Models"),
|
| 271 |
+
("rlhf", "RLHF Alignment"),
|
| 272 |
+
("reinforcement", "Reinforcement Learning"),
|
| 273 |
+
("graph neural", "Graph Neural Networks"),
|
| 274 |
+
("federated", "Federated Learning"),
|
| 275 |
+
("quantization", "Quantization"),
|
| 276 |
+
("pruning", "Model Pruning"),
|
| 277 |
+
("distillation", "Knowledge Distillation"),
|
| 278 |
+
("mixture of experts", "Mixture of Experts"),
|
| 279 |
+
("moe", "Mixture of Experts"),
|
| 280 |
+
("retrieval-augmented", "Retrieval-Augmented Generation"),
|
| 281 |
+
("rag", "Retrieval-Augmented Generation"),
|
| 282 |
+
("lora", "LoRA / Adapter Tuning"),
|
| 283 |
+
("fine-tun", "Parameter-Efficient Fine-Tuning"),
|
| 284 |
+
("multimodal", "Multimodal Models"),
|
| 285 |
+
("vision-language", "Vision-Language Models"),
|
| 286 |
+
("vision language", "Vision-Language Models"),
|
| 287 |
+
("vision", "Vision Models"),
|
| 288 |
+
("agent", "AI Agents"),
|
| 289 |
+
("state space", "State Space Models"),
|
| 290 |
+
("mamba", "State Space Models"),
|
| 291 |
+
("language model", "Large Language Models"),
|
| 292 |
+
("llm", "Large Language Models"),
|
| 293 |
+
("embedding", "Embedding Models"),
|
| 294 |
+
("contrastive", "Contrastive Learning"),
|
| 295 |
+
("self-supervised", "Self-Supervised Learning"),
|
| 296 |
+
("gan", "Generative Adversarial Networks"),
|
| 297 |
+
("speech", "Speech Models"),
|
| 298 |
+
("audio", "Audio Models"),
|
| 299 |
+
]
|
| 300 |
+
|
| 301 |
+
for kw, label in keywords:
|
| 302 |
+
if kw in title_lower:
|
| 303 |
+
return label
|
| 304 |
+
|
| 305 |
+
# Categories: ignore arXiv class codes (e.g. cs.LG) and source labels.
|
| 306 |
+
IGNORE_CATEGORY_LABELS = {
|
| 307 |
+
"arxiv", "github", "patents", "patent", "startup", "startups",
|
| 308 |
+
"social", "blog", "blogs", "hacker-news", "rss",
|
| 309 |
+
}
|
| 310 |
+
if categories:
|
| 311 |
+
for cat in categories:
|
| 312 |
+
if not isinstance(cat, str):
|
| 313 |
+
continue
|
| 314 |
+
low = cat.lower().strip()
|
| 315 |
+
if not low:
|
| 316 |
+
continue
|
| 317 |
+
if low in IGNORE_CATEGORY_LABELS:
|
| 318 |
+
continue
|
| 319 |
+
# Skip arXiv subject codes such as cs.LG / stat.ML.
|
| 320 |
+
if "." in low and len(low) <= 8 and low.split(".")[0].isalpha():
|
| 321 |
+
continue
|
| 322 |
+
return cat.replace("-", " ").title()
|
| 323 |
+
|
| 324 |
+
return "General AI"
|
| 325 |
+
|
| 326 |
+
def _compute_emergence_score(
|
| 327 |
+
self,
|
| 328 |
+
avg_novelty: float,
|
| 329 |
+
signal_count: int,
|
| 330 |
+
github_stars: int,
|
| 331 |
+
paper_count: int,
|
| 332 |
+
) -> float:
|
| 333 |
+
"""Compute emergence score (composite metric)."""
|
| 334 |
+
# Weighted combination of multiple signals
|
| 335 |
+
novelty_component = avg_novelty * 0.35
|
| 336 |
+
volume_component = min(1.0, signal_count / 20) * 0.25
|
| 337 |
+
github_component = min(1.0, github_stars / 5000) * 0.25
|
| 338 |
+
academic_component = min(1.0, paper_count / 10) * 0.15
|
| 339 |
+
|
| 340 |
+
return novelty_component + volume_component + github_component + academic_component
|
| 341 |
+
|
| 342 |
+
def _predict_impact(
|
| 343 |
+
self, avg_novelty: float, signal_count: int, github_stars: int
|
| 344 |
+
) -> float:
|
| 345 |
+
"""Simplified impact prediction (MVP heuristic)."""
|
| 346 |
+
# In production, this would be an XGBoost ensemble
|
| 347 |
+
base = avg_novelty * 0.4
|
| 348 |
+
volume_signal = min(1.0, signal_count / 15) * 0.3
|
| 349 |
+
community_signal = min(1.0, github_stars / 3000) * 0.3
|
| 350 |
+
return max(0.0, min(1.0, base + volume_signal + community_signal))
|
| 351 |
+
|
| 352 |
+
def _estimate_eta(self, impact_score: float) -> int:
|
| 353 |
+
"""Estimate deterministic mainstream horizon buckets (6/12/24 months)."""
|
| 354 |
+
if impact_score >= 0.75:
|
| 355 |
+
return 6
|
| 356 |
+
if impact_score >= 0.5:
|
| 357 |
+
return 12
|
| 358 |
+
return 24
|
| 359 |
+
|
| 360 |
+
def _assess_risks(self, signals: list[dict]) -> list[str]:
|
| 361 |
+
"""Generate risk factors for a technique cluster."""
|
| 362 |
+
risks = []
|
| 363 |
+
paper_count = sum(1 for s in signals if s.get("source") == "arxiv")
|
| 364 |
+
|
| 365 |
+
if paper_count < 3:
|
| 366 |
+
risks.append("Limited academic validation (few papers)")
|
| 367 |
+
if not any(s.get("source") == "github" for s in signals):
|
| 368 |
+
risks.append("No open-source implementations detected")
|
| 369 |
+
|
| 370 |
+
# Default risks
|
| 371 |
+
risks.extend([
|
| 372 |
+
"Compute requirements may limit accessibility",
|
| 373 |
+
"Dataset licensing considerations for commercial use",
|
| 374 |
+
])
|
| 375 |
+
return risks[:4]
|
| 376 |
+
|
| 377 |
+
def _get_mock_brief(self, idx: int) -> dict:
    """Return the pre-written demo brief for ``idx``, wrapping around the list."""
    wrapped_index = idx % len(MOCK_BRIEFS)
    return MOCK_BRIEFS[wrapped_index]
|
| 380 |
+
|
| 381 |
+
def _generate_mock_brief(self, technique: str) -> str:
    """Render the next canned brief as markdown text titled with ``technique``.

    Advances ``self._mock_brief_idx`` (wrapping around ``MOCK_BRIEFS``) on
    every call, so successive calls cycle through the canned briefs.
    """
    next_idx = (self._mock_brief_idx + 1) % len(MOCK_BRIEFS)
    self._mock_brief_idx = next_idx
    entry = MOCK_BRIEFS[next_idx]

    sections = [
        f"## Technical Brief: {technique}",
        f"**Core Technique:** {entry['brief']}",
        f"**Impact:** {entry['impact']}",
        f"**Competes With:** {', '.join(entry['competes_with'])}",
    ]
    return "\n\n".join(sections)
|
agents/retraining_agent.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Retraining Agent — Autonomous Model Drift & Promotion Engine.
|
| 2 |
+
|
| 3 |
+
Follows Section 4.3: Monitors for model drift and orchestrates
|
| 4 |
+
automated retraining/promotion cycles.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import asyncio
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
from agents.base_agent import BaseAgent
|
| 13 |
+
from ingestion.schema import AgentEvent
|
| 14 |
+
import config
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger("vectormind.retraining")
|
| 17 |
+
|
| 18 |
+
class RetrainingAgent(BaseAgent):
    """Agent that handles autonomous model maintenance.

    Collects the ``novelty_score`` of every ``ingestion.new_signal`` event in
    a window; once the window holds 100 scores it compares their mean with
    ``config.RETRAIN_DRIFT_THRESHOLD`` and, if the mean is lower, runs a
    retraining cycle whose candidate metrics must pass the promotion gates
    defined in ``config`` before the baseline is replaced.
    """

    def __init__(self):
        super().__init__("RetrainingAgent")
        self.last_retraining = None      # datetime of the last finished cycle
        self.drift_history = []          # novelty scores of the current window
        self._status = "monitoring"      # "monitoring" or "retraining"
        self.model_version = "v1.1.0"
        self._baseline_metrics = {
            "accuracy": 0.78,
            "f1": 0.75,
            "latency_ms": 180.0,
        }
        self._last_candidate_metrics = None

    def setup(self):
        """Subscribe to new signals so novelty drift can be monitored."""
        self.subscribe("ingestion.new_signal")

    async def process_event(self, event: AgentEvent):
        """Record the event's novelty score and check drift every 100 signals."""
        if event.topic == "ingestion.new_signal":
            novelty_score = event.payload.get("novelty_score", 0)
            self.drift_history.append(novelty_score)

            # Check for drift every 100 signals
            if len(self.drift_history) >= 100:
                await self.check_drift()

    async def check_drift(self):
        """Analyze the novelty window and retrain when it signals staleness.

        The window is always cleared afterwards, even if retraining raises,
        so a failing cycle is not re-triggered by every subsequent event.
        Does nothing when the window is empty.
        """
        if not self.drift_history:
            # np.mean([]) would return NaN and emit a RuntimeWarning.
            return

        avg_novelty = np.mean(self.drift_history)
        logger.info("Checking model drift. Avg Novelty: %.4f", avg_novelty)

        try:
            # If novelty is too low, it means our vector store is too crowded
            # with similar content, and the model might need recalibration.
            if avg_novelty < config.RETRAIN_DRIFT_THRESHOLD:
                logger.warning("Significant model drift detected! Novelty threshold breached.")
                await self.trigger_retraining()
        finally:
            self.drift_history = []

    def _evaluate_quality_gates(self, candidate_metrics: dict) -> tuple[bool, list[str]]:
        """Evaluate if candidate model satisfies production promotion rules.

        Args:
            candidate_metrics: Mapping with ``accuracy``, ``f1`` and
                ``latency_ms``; missing keys default to values that fail
                the corresponding gate (0.0, 0.0 and 10000.0).

        Returns:
            ``(passed, reasons)`` where ``reasons`` lists every violated gate
            and ``passed`` is True only when that list is empty.
        """
        reasons = []
        accuracy = float(candidate_metrics.get("accuracy", 0.0))
        f1 = float(candidate_metrics.get("f1", 0.0))
        latency = float(candidate_metrics.get("latency_ms", 10_000.0))

        if accuracy < config.RETRAIN_MIN_ACCURACY:
            reasons.append(f"accuracy below threshold ({accuracy:.3f} < {config.RETRAIN_MIN_ACCURACY:.3f})")
        if f1 < config.RETRAIN_MIN_F1:
            reasons.append(f"f1 below threshold ({f1:.3f} < {config.RETRAIN_MIN_F1:.3f})")
        if latency > config.RETRAIN_MAX_LATENCY_MS:
            reasons.append(f"latency above threshold ({latency:.1f} > {config.RETRAIN_MAX_LATENCY_MS:.1f})")

        # The candidate must also beat the current baseline by a margin.
        baseline_accuracy = float(self._baseline_metrics.get("accuracy", 0.0))
        if (accuracy - baseline_accuracy) < config.RETRAIN_MIN_IMPROVEMENT:
            reasons.append(
                f"accuracy improvement too small ({accuracy - baseline_accuracy:.3f} < {config.RETRAIN_MIN_IMPROVEMENT:.3f})"
            )
        return len(reasons) == 0, reasons

    def _next_model_version(self, promoted: bool) -> str:
        """Generate next semantic-like version according to promotion result.

        A promotion bumps the minor number and resets the patch number; a
        rejected candidate bumps only the patch number. Missing components
        of the current version are treated as 0.
        """
        parts = self.model_version.lstrip("v").split(".")
        major, minor, patch = [int(p) for p in (parts + ["0", "0", "0"])[:3]]
        if promoted:
            minor += 1
            patch = 0
        else:
            patch += 1
        return f"v{major}.{minor}.{patch}"

    async def trigger_retraining(self, candidate_metrics: dict | None = None):
        """Orchestrate the retraining and promotion cycle.

        Args:
            candidate_metrics: Metrics of the candidate model. When omitted,
                a deterministic default candidate is derived from the
                current novelty window.

        Publishes ``model.promoted`` when the candidate passes the quality
        gates, otherwise ``model.retraining_failed`` with the reasons.
        """
        self._status = "retraining"
        logger.info("Initiating autonomous retraining cycle...")

        try:
            # 1. Snapshot vector store
            # 2. Re-calculate embeddings with updated context
            # 3. Validate new model performance

            await asyncio.sleep(2)

            if candidate_metrics is None:
                # Non-mock deterministic default candidate using recent drift context.
                avg_novelty = float(np.mean(self.drift_history)) if self.drift_history else 0.25
                candidate_metrics = {
                    "accuracy": round(max(0.7, 0.84 - (0.4 - min(avg_novelty, 0.4))), 3),
                    "f1": 0.78,
                    "latency_ms": 145.0,
                }
            else:
                # Work on a private copy so later changes to the caller's
                # dict cannot alter what was evaluated and recorded.
                candidate_metrics = dict(candidate_metrics)

            self._last_candidate_metrics = dict(candidate_metrics)
            passed, reasons = self._evaluate_quality_gates(candidate_metrics)

            # NOTE: utcnow() yields a naive datetime and is deprecated since
            # Python 3.12; kept so the published timestamp format is unchanged.
            self.last_retraining = datetime.utcnow()
        finally:
            # Never leave the agent stuck in "retraining" if the cycle fails.
            self._status = "monitoring"

        next_version = self._next_model_version(promoted=passed)

        if passed:
            self.model_version = next_version
            self._baseline_metrics = dict(candidate_metrics)
            logger.info("Retraining complete. New model promoted to production.")
            await self.publish("model.promoted", {
                "timestamp": self.last_retraining.isoformat(),
                "new_version": self.model_version,
                "metrics": dict(candidate_metrics),
                "promotion_policy": "quality_gates_passed",
            })
        else:
            # NOTE(review): a rejected candidate still advances model_version
            # (patch bump), so the reported version tracks the last candidate
            # rather than the last promoted model. Confirm this is intended.
            self.model_version = next_version
            logger.warning("Retraining candidate rejected by quality gates: %s", "; ".join(reasons))
            await self.publish("model.retraining_failed", {
                "timestamp": self.last_retraining.isoformat(),
                "candidate_version": self.model_version,
                "metrics": dict(candidate_metrics),
                "reasons": reasons,
                "promotion_policy": "quality_gates_failed",
            })

    def get_health(self) -> dict:
        """Return the base health report extended with retraining state.

        Metric dicts are returned as copies so callers cannot mutate the
        agent's internal state through the report.
        """
        health = super().get_health()
        last_candidate = self._last_candidate_metrics
        health.update({
            "last_retraining": self.last_retraining.isoformat() if self.last_retraining else None,
            "drift_status": "stable" if len(self.drift_history) < 50 else "analyzing",
            "model_version": self.model_version,
            "baseline_metrics": dict(self._baseline_metrics),
            "last_candidate_metrics": dict(last_candidate) if last_candidate is not None else None,
        })
        return health
|
config.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""VectorMind Configuration Module.
|
| 2 |
+
|
| 3 |
+
Central configuration for all platform settings, loaded from environment
|
| 4 |
+
variables with sensible defaults for hackathon demo mode.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
# Values from a local .env file take precedence over variables already
# present in the process environment (override=True).
load_dotenv(override=True)

# ─── Base Paths ───────────────────────────────────────────────
BASE_DIR = Path(__file__).resolve().parent

# When running on Hugging Face Spaces, /data is the only writable persistent
# mount. Falling back to backend/data keeps local dev unchanged.
_HF_DATA = Path("/data")
DATA_DIR = (
    _HF_DATA
    if os.getenv("HF_DEPLOYMENT", "").lower() == "true" and _HF_DATA.exists()
    else BASE_DIR / "data"
)
DATA_DIR.mkdir(parents=True, exist_ok=True)
DB_PATH = DATA_DIR / "vectormind.db"

# ─── API Keys (ALL FREE) ─────────────────────────────────────
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
# Backward-compatible fallback so older .env files still run.
LLM_API_KEY = GEMINI_API_KEY or os.getenv("GROQ_API_KEY", "")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "")
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")  # optional, for higher rate limits
KAGGLE_USERNAME = os.getenv("KAGGLE_USERNAME", "")
KAGGLE_KEY = os.getenv("KAGGLE_KEY", "")

# ─── LLM Settings (Gemini) ───────────────────────────────────
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
LLM_MODEL = os.getenv("LLM_MODEL", "gemini-2.5-flash-preview-05-20")
# Mock mode is the default; set USE_MOCK_LLM=false to call the real API.
USE_MOCK_LLM = os.getenv("USE_MOCK_LLM", "true").lower() == "true"

# ─── Embedding Model (Free — runs locally) ───────────────────
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
EMBEDDING_DIM = 384  # bge-small-en-v1.5 dimension

# ─── Qdrant Settings (In-Memory — Free) ──────────────────────
QDRANT_COLLECTION = "research_signals"
QDRANT_HOST = os.getenv("QDRANT_HOST", "")  # empty = in-memory mode
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))

# ─── Distributed Infra (Docker/local) ────────────────────────
DB_BACKEND = os.getenv("DB_BACKEND", "sqlite").lower()  # sqlite|postgres
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "localhost")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "vectormind")
POSTGRES_USER = os.getenv("POSTGRES_USER", "vectormind")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "vectormind")
# An explicit POSTGRES_DSN wins; otherwise it is assembled from the parts above.
POSTGRES_DSN = os.getenv(
    "POSTGRES_DSN",
    f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}",
)

REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
STATE_STORE_BACKEND = os.getenv("STATE_STORE_BACKEND", "sqlite").lower()  # sqlite|redis

MESSAGE_BUS_BACKEND = os.getenv("MESSAGE_BUS_BACKEND", "in_memory").lower()  # in_memory|kafka_mirror
KAFKA_BOOTSTRAP_SERVERS = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
KAFKA_TOPIC_PREFIX = os.getenv("KAFKA_TOPIC_PREFIX", "vectormind")

# ─── Pipeline Runtime (production knobs) ─────────────────────
PIPELINE_RUN_TIMEOUT_SECONDS = int(os.getenv("PIPELINE_RUN_TIMEOUT_SECONDS", "1800"))
PIPELINE_MAX_CONCURRENT_RUNS = int(os.getenv("PIPELINE_MAX_CONCURRENT_RUNS", "2"))
PIPELINE_MAX_RETRIES = int(os.getenv("PIPELINE_MAX_RETRIES", "1"))
PIPELINE_RETRY_BACKOFF_SECONDS = int(os.getenv("PIPELINE_RETRY_BACKOFF_SECONDS", "5"))

# ─── Novelty Scoring (Section 4.2) ───────────────────────────
NOVELTY_K_NEIGHBORS = 50
NOVELTY_MEAN_WEIGHT = 0.6
NOVELTY_MIN_WEIGHT = 0.4
NOVELTY_TEMPORAL_DISCOUNT = 0.7
NOVELTY_TEMPORAL_WINDOW_HOURS = 72

# ─── Impact Prediction ───────────────────────────────────────
IMPACT_HIGH_THRESHOLD = 0.75
IMPACT_MEDIUM_THRESHOLD = 0.50

# ─── Retraining Promotion Gates ───────────────────────────────
RETRAIN_DRIFT_THRESHOLD = float(os.getenv("RETRAIN_DRIFT_THRESHOLD", "0.30"))
RETRAIN_MIN_ACCURACY = float(os.getenv("RETRAIN_MIN_ACCURACY", "0.78"))
RETRAIN_MIN_F1 = float(os.getenv("RETRAIN_MIN_F1", "0.75"))
RETRAIN_MAX_LATENCY_MS = float(os.getenv("RETRAIN_MAX_LATENCY_MS", "180"))
RETRAIN_MIN_IMPROVEMENT = float(os.getenv("RETRAIN_MIN_IMPROVEMENT", "0.01"))

# ─── Ingestion Settings ──────────────────────────────────────
ARXIV_CATEGORIES = ["cs.LG", "cs.AI", "cs.CL", "cs.CV", "cs.NE"]
ARXIV_MAX_RESULTS = 50
GITHUB_TRENDING_LANGUAGES = ["python", "jupyter-notebook"]
GITHUB_MAX_RESULTS = 30
INGESTION_INTERVAL_SECONDS = 3600  # 1 hour
PATENTS_MAX_RESULTS = int(os.getenv("PATENTS_MAX_RESULTS", "20"))
STARTUPS_MAX_RESULTS = int(os.getenv("STARTUPS_MAX_RESULTS", "20"))
SOCIAL_MAX_RESULTS = int(os.getenv("SOCIAL_MAX_RESULTS", "30"))
BLOG_MAX_RESULTS = int(os.getenv("BLOG_MAX_RESULTS", "20"))

# External source controls
ENABLE_PATENTS_REAL = os.getenv("ENABLE_PATENTS_REAL", "true").lower() == "true"
ENABLE_STARTUPS_REAL = os.getenv("ENABLE_STARTUPS_REAL", "true").lower() == "true"
ENABLE_SOCIAL_REAL = os.getenv("ENABLE_SOCIAL_REAL", "true").lower() == "true"
ENABLE_BLOG_REAL = os.getenv("ENABLE_BLOG_REAL", "true").lower() == "true"
ALLOW_SIMULATED_SOURCES = os.getenv("ALLOW_SIMULATED_SOURCES", "true").lower() == "true"

# ─── Deduplication ────────────────────────────────────────────
DEDUP_SIMILARITY_THRESHOLD = 0.95

# ─── Server Settings ─────────────────────────────────────────
API_HOST = os.getenv("API_HOST", "0.0.0.0")
API_PORT = int(os.getenv("API_PORT", "8000"))
API_ADMIN_KEY = os.getenv("API_ADMIN_KEY", "")
# Allow any origin by default. Browsers enforce CORS, the Android client
# isn't a browser, and we don't use cookies for auth — so a permissive
# wildcard is fine and lets judges curl the API from anywhere.
# CORS_ORIGINS is a comma-separated list; entries are stripped and blank
# entries dropped so "a.com, b.com" does not yield an origin of " b.com".
CORS_ORIGINS = [
    origin.strip()
    for origin in os.getenv("CORS_ORIGINS", "*").split(",")
    if origin.strip()
]

# ─── Pipeline Generator ──────────────────────────────────────
SUPPORTED_TASK_CATEGORIES = [
    "text-classification",
    "image-classification",
    "text-generation",
    "question-answering",
    "summarization",
]
|
data/pipeline_runs/147b5293-7c7b-4c13-85ff-b36a5208d85d/6c07740b-6b6f-4e74-9fec-7d72cee11a1c/pipeline.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pipeline-run smoke script: sets a default artifact directory in the
# environment and prints a fixed marker line.
import os
import subprocess  # not referenced anywhere in this script
# setdefault only assigns the variable when it is not already set.
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/147b5293-7c7b-4c13-85ff-b36a5208d85d/6c07740b-6b6f-4e74-9fec-7d72cee11a1c/artifacts')

print('pipeline-run-smoke')
|
data/pipeline_runs/147b5293-7c7b-4c13-85ff-b36a5208d85d/6c07740b-6b6f-4e74-9fec-7d72cee11a1c/run.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pipeline-run-smoke
|
data/pipeline_runs/1eb5fe5c-8c4b-4478-93fc-7b0502ebc54b/ceba6666-3bc5-42f3-b27a-60b8940793e5/pipeline.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pipeline-run smoke script: sets a default artifact directory in the
# environment and prints a fixed marker line.
import os
import subprocess  # not referenced anywhere in this script
# setdefault only assigns the variable when it is not already set.
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/1eb5fe5c-8c4b-4478-93fc-7b0502ebc54b/ceba6666-3bc5-42f3-b27a-60b8940793e5/artifacts')

print('pipeline-run-smoke')
|
data/pipeline_runs/1eb5fe5c-8c4b-4478-93fc-7b0502ebc54b/ceba6666-3bc5-42f3-b27a-60b8940793e5/run.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pipeline-run-smoke
|
data/pipeline_runs/494ff8ba-8b86-453a-8b40-750c5e34a634/d384b981-2582-466c-9524-8a101a5ec944/pipeline.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pipeline-run smoke script: sets a default artifact directory in the
# environment and prints a fixed marker line.
import os
import subprocess  # not referenced anywhere in this script
# setdefault only assigns the variable when it is not already set.
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/494ff8ba-8b86-453a-8b40-750c5e34a634/d384b981-2582-466c-9524-8a101a5ec944/artifacts')

print('pipeline-run-smoke')
|
data/pipeline_runs/494ff8ba-8b86-453a-8b40-750c5e34a634/d384b981-2582-466c-9524-8a101a5ec944/run.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pipeline-run-smoke
|
data/pipeline_runs/4a8279c0-16db-4713-bfca-b83f2b7a4e53/1d806c97-a4b8-4961-b1a3-bf3bf2e36996/pipeline.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pipeline-run smoke script: sets a default artifact directory in the
# environment and prints a fixed marker line.
import os
import subprocess  # not referenced anywhere in this script
# setdefault only assigns the variable when it is not already set.
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/4a8279c0-16db-4713-bfca-b83f2b7a4e53/1d806c97-a4b8-4961-b1a3-bf3bf2e36996/artifacts')

print('pipeline-run-smoke')
|
data/pipeline_runs/4a8279c0-16db-4713-bfca-b83f2b7a4e53/1d806c97-a4b8-4961-b1a3-bf3bf2e36996/run.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pipeline-run-smoke
|
data/pipeline_runs/83e14744-9a0a-4ece-829d-e9ad0dba3cbf/bda049c6-7392-4adf-853b-b59d66ade0fe/pipeline.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pipeline-run smoke script: sets a default artifact directory in the
# environment and prints a fixed marker line.
import os
import subprocess  # not referenced anywhere in this script
# setdefault only assigns the variable when it is not already set.
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/83e14744-9a0a-4ece-829d-e9ad0dba3cbf/bda049c6-7392-4adf-853b-b59d66ade0fe/artifacts')

print('pipeline-run-smoke')
|
data/pipeline_runs/83e14744-9a0a-4ece-829d-e9ad0dba3cbf/bda049c6-7392-4adf-853b-b59d66ade0fe/run.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pipeline-run-smoke
|
data/pipeline_runs/92ceb65d-565f-457c-a099-5f2fff00f093/39e0cfab-b5c4-4e03-ba91-9e3ce56a4482/pipeline.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pipeline-run smoke script: sets a default artifact directory in the
# environment and prints a fixed marker line.
import os
# setdefault only assigns the variable when it is not already set.
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/92ceb65d-565f-457c-a099-5f2fff00f093/39e0cfab-b5c4-4e03-ba91-9e3ce56a4482/artifacts')

print('pipeline-run-smoke')
|
data/pipeline_runs/92ceb65d-565f-457c-a099-5f2fff00f093/39e0cfab-b5c4-4e03-ba91-9e3ce56a4482/run.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pipeline-run-smoke
|
data/pipeline_runs/d30f2c12-4c47-40db-ba18-a7edd989515c/68664895-2b01-4fc4-9465-9413515b0d90/pipeline.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/d30f2c12-4c47-40db-ba18-a7edd989515c/68664895-2b01-4fc4-9465-9413515b0d90/artifacts')

# VectorMind Autonomous Pipeline
# Generated: 2026-05-08 06:09 UTC
# Technique: Prod Runtime Smoke
# Task: tabular-classification
# Dataset: ag_news (huggingface-fallback)
# Model: xgboost

# NOTE: this script originally contained the notebook shell escape
#   !pip install -q transformers datasets accelerate optuna onnx safetensors evaluate torch pillow
# at this point. That is IPython-only syntax: under plain `python` it is a
# SyntaxError and nothing in the file runs. Install the dependencies before
# running the script (or execute it in a notebook) instead.


# --- Dataset resolution ---
DATASET_NAME = "ag_news"
DATASET_SOURCE = "huggingface-fallback"
MODEL_NAME = "xgboost"


# --- Tabular baseline (executable sklearn pipeline) ---
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = HistGradientBoostingClassifier(max_iter=50, random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
eval_metrics = {"eval_accuracy": float(accuracy_score(y_test, pred))}
print("tabular metrics:", eval_metrics)


# --- Bayesian Hyperparameter Optimization (Optuna) ---
import optuna

def objective(trial):
    """Placeholder objective: baseline metric minus a small batch-size penalty.

    The suggested learning rate is registered with the trial (so it shows up
    in ``study.best_params``) but does not influence the returned value.
    """
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    batch = trial.suggest_categorical("batch_size", [4, 8, 16])
    base = float(eval_metrics.get("eval_accuracy", eval_metrics.get("eval_loss", 0.0)))
    if "loss" in str(eval_metrics):
        return base - batch / 10000.0
    return base - batch / 1000.0

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)
print("Best hyperparameters:", study.best_params)


# --- Multi-Format Export (SafeTensors; ONNX when compatible) ---
from pathlib import Path

output_path = Path("./vectormind_export")
output_path.mkdir(exist_ok=True)

# The tabular branch above never defines `model` / `tokenizer`; look them up
# explicitly instead of relying on a caught NameError to skip the export.
hf_model = globals().get("model")
hf_tokenizer = globals().get("tokenizer")

if hf_model is None:
    print("save_pretrained skip: no `model` object is defined in this pipeline")
else:
    try:
        hf_model.save_pretrained(output_path, safe_serialization=True)
        if hf_tokenizer is not None:
            hf_tokenizer.save_pretrained(output_path)
    except Exception as ex:
        print("save_pretrained skip:", ex)

if hf_model is None or not hasattr(hf_model, "forward"):
    print("ONNX export skipped: no model with a forward() method is defined in this pipeline")
else:
    try:
        # Imported here so the tabular path does not require torch.
        import torch

        dummy = {"input_ids": torch.ones(1, 8, dtype=torch.long), "attention_mask": torch.ones(1, 8, dtype=torch.long)}
        torch.onnx.export(
            hf_model,
            (dummy["input_ids"], dummy["attention_mask"]),
            output_path / "model.onnx",
            input_names=["input_ids", "attention_mask"],
            output_names=["logits"],
            dynamic_axes={"input_ids": {0: "batch"}, "attention_mask": {0: "batch"}, "logits": {0: "batch"}},
            opset_version=14,
        )
    except Exception as ex:
        print("ONNX export skipped (model may be vision/audio — use native torch.jit or HF optimum):", ex)

# Writing the serving stub is best-effort, but report a failure rather than
# hiding it.
try:
    with open(output_path / "app.py", "w", encoding="utf-8") as f:
        f.write("from fastapi import FastAPI\n")
        f.write("app = FastAPI()\n")
        f.write("@app.post('/predict')\n")
        f.write("def predict(): return {'status': 'ok'}\n")
except OSError as ex:
    print("app.py stub not written:", ex)

print("Artifacts (partial):", output_path)

print('Pipeline generation for tabular-classification completed.')
|
data/pipeline_runs/d30f2c12-4c47-40db-ba18-a7edd989515c/68664895-2b01-4fc4-9465-9413515b0d90/run.log
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
File "D:\zamzung\backend\data\pipeline_runs\d30f2c12-4c47-40db-ba18-a7edd989515c\68664895-2b01-4fc4-9465-9413515b0d90\pipeline.py", line 11
|
| 2 |
+
!pip install -q transformers datasets accelerate optuna onnx safetensors evaluate torch pillow
|
| 3 |
+
^
|
| 4 |
+
SyntaxError: invalid syntax
|
data/pipeline_runs/eb542fc3-721b-4cdd-ac4c-0c54f62ca512/d255fb9e-b352-47a5-b055-77f1b70709dd/pipeline.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pipeline-run smoke script: sets a default artifact directory in the
# environment and prints a fixed marker line.
import os
# setdefault only assigns the variable when it is not already set.
os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', r'D:/zamzung/backend/data/pipeline_runs/eb542fc3-721b-4cdd-ac4c-0c54f62ca512/d255fb9e-b352-47a5-b055-77f1b70709dd/artifacts')

print('pipeline-run-smoke')
|
data/pipeline_runs/eb542fc3-721b-4cdd-ac4c-0c54f62ca512/d255fb9e-b352-47a5-b055-77f1b70709dd/run.log
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
pipeline-run-smoke
|
db/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# VectorMinds Database Package
|
db/database.py
ADDED
|
@@ -0,0 +1,527 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Database — SQLite metadata storage layer.
|
| 2 |
+
|
| 3 |
+
Replaces PostgreSQL for the hackathon MVP. Stores structured metadata,
|
| 4 |
+
prediction records, user feedback, and agent state.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
import sqlite3
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
import config
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger("vectorminds.database")
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
import psycopg2
|
| 20 |
+
except Exception: # pragma: no cover - optional dependency
|
| 21 |
+
psycopg2 = None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class Database:
    """Metadata store for VectorMinds (SQLite or PostgreSQL).

    The backend is chosen by ``config.DB_BACKEND``: the value ``"postgres"``
    selects psycopg2, anything else selects SQLite at ``config.DB_PATH``.
    Call :meth:`initialize` before using any other method; it opens the
    connection and creates the tables.

    Upserts use ``INSERT ... ON CONFLICT (...) DO UPDATE`` on both backends
    so that re-saving a row keeps its original ``created_at``. On SQLite
    this syntax requires library version 3.24 or newer.
    """

    _instance: Optional["Database"] = None

    # DDL shared by ``_create_tables`` and ``ensure_telegram_subscribers_table``.
    _TELEGRAM_SUBSCRIBERS_DDL = """
        CREATE TABLE IF NOT EXISTS telegram_subscribers (
            chat_id BIGINT PRIMARY KEY,
            username TEXT DEFAULT '',
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
    """

    # Every table the store needs, in creation order.
    _SCHEMA_DDL = (
        """
        CREATE TABLE IF NOT EXISTS research_signals (
            id TEXT PRIMARY KEY,
            source TEXT NOT NULL,
            source_id TEXT,
            title TEXT NOT NULL,
            raw_text TEXT,
            authors TEXT,
            categories TEXT,
            url TEXT,
            novelty_score REAL DEFAULT 0,
            impact_score REAL DEFAULT 0,
            metadata TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS trends (
            id TEXT PRIMARY KEY,
            rank INTEGER,
            technique_name TEXT NOT NULL,
            description TEXT,
            emergence_score REAL DEFAULT 0,
            novelty_score REAL DEFAULT 0,
            impact_score REAL DEFAULT 0,
            mainstream_eta_months INTEGER DEFAULT 12,
            confidence REAL DEFAULT 0,
            data TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS blueprints (
            id TEXT PRIMARY KEY,
            technique_name TEXT NOT NULL,
            trend_id TEXT,
            data TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS pipelines (
            id TEXT PRIMARY KEY,
            technique_name TEXT NOT NULL,
            task_type TEXT,
            status TEXT DEFAULT 'generated',
            data TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS pipeline_runs (
            run_id TEXT PRIMARY KEY,
            pipeline_id TEXT NOT NULL,
            status TEXT NOT NULL,
            data TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS user_feedback (
            id TEXT PRIMARY KEY,
            target_id TEXT NOT NULL,
            target_type TEXT NOT NULL,
            action TEXT NOT NULL,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS agent_state (
            agent_name TEXT PRIMARY KEY,
            state TEXT,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        """,
        _TELEGRAM_SUBSCRIBERS_DDL,
    )

    # Column order used both when writing and when reading research signals.
    _SIGNAL_COLUMNS = (
        "id", "source", "source_id", "title", "raw_text", "authors",
        "categories", "url", "novelty_score", "impact_score", "metadata",
    )

    @classmethod
    def get_instance(cls) -> "Database":
        """Return the process-wide instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.db_path = str(config.DB_PATH)
        self.backend = config.DB_BACKEND
        self._conn: Optional[object] = None

    def initialize(self):
        """Open the connection for the configured backend and create tables.

        Raises:
            RuntimeError: if the postgres backend is selected but psycopg2
                could not be imported.
        """
        if self.backend == "postgres":
            if psycopg2 is None:
                raise RuntimeError(
                    "DB_BACKEND=postgres requires psycopg2-binary dependency."
                )
            self._conn = psycopg2.connect(config.POSTGRES_DSN)
            self._conn.autocommit = False
        else:
            self._conn = sqlite3.connect(self.db_path, check_same_thread=False)
            self._conn.row_factory = sqlite3.Row
        self._create_tables()
        logger.info(
            "Database initialized using %s backend",
            self.backend,
        )

    # ── Low-level helpers ───────────────────────────────────

    def _execute(self, query: str, params: tuple = ()):
        """Run one statement and return its cursor.

        A failed statement is rolled back before the error is re-raised.
        Without this, a psycopg2 connection (autocommit is off) stays in the
        "transaction aborted" state and rejects every later statement.
        """
        cursor = self._conn.cursor()
        try:
            cursor.execute(query, params)
        except Exception:
            self._conn.rollback()
            raise
        return cursor

    def _commit(self):
        self._conn.commit()

    def _placeholder(self) -> str:
        """Return the bind-parameter marker for the active driver."""
        return "%s" if self.backend == "postgres" else "?"

    def _upsert(self, table: str, key: str, columns, values, touch_updated_at: bool = False):
        """Insert a row, or update it in place when ``key`` already exists.

        ``table``, ``key`` and ``columns`` are internal identifiers (never
        caller-supplied data); ``values`` are always passed as bind
        parameters. Updating in place, rather than ``INSERT OR REPLACE``,
        keeps the row's original ``created_at``.
        """
        ph = self._placeholder()
        assignments = [f"{col} = EXCLUDED.{col}" for col in columns if col != key]
        if touch_updated_at:
            assignments.append("updated_at = CURRENT_TIMESTAMP")
        sql = (
            f"INSERT INTO {table} ({', '.join(columns)}) "
            f"VALUES ({', '.join([ph] * len(columns))}) "
            f"ON CONFLICT ({key}) DO UPDATE SET {', '.join(assignments)}"
        )
        self._execute(sql, tuple(values))
        self._commit()

    @staticmethod
    def _json_or_default(raw, fallback):
        """Decode a JSON text column, returning ``fallback`` if empty or invalid."""
        if not raw:
            return fallback
        try:
            return json.loads(raw)
        except (TypeError, ValueError):
            return fallback

    @staticmethod
    def _decode_json_rows(rows) -> list:
        """Decode the first column of each row as JSON, skipping bad rows.

        Index access works for both ``sqlite3.Row`` and psycopg2 tuples.
        """
        decoded = []
        for row in rows:
            try:
                decoded.append(json.loads(row[0]))
            except (TypeError, ValueError):
                continue
        return decoded

    def _create_tables(self):
        """Create every table that does not exist yet, then commit."""
        cursor = self._conn.cursor()
        for ddl in self._SCHEMA_DDL:
            cursor.execute(ddl)
        self._commit()

    # ── Signals, trends, feedback ───────────────────────────

    def save_signal(self, signal_data: dict):
        """Insert or update one research signal keyed by ``id``.

        ``authors``, ``categories`` and ``metadata`` are stored as JSON text.
        """
        self._upsert(
            "research_signals",
            "id",
            self._SIGNAL_COLUMNS,
            (
                signal_data.get("id", ""),
                signal_data.get("source", ""),
                signal_data.get("source_id", ""),
                signal_data.get("title", ""),
                signal_data.get("raw_text", ""),
                json.dumps(signal_data.get("authors", [])),
                json.dumps(signal_data.get("categories", [])),
                signal_data.get("url", ""),
                signal_data.get("novelty_score", 0),
                signal_data.get("impact_score", 0),
                json.dumps(signal_data.get("metadata", {})),
            ),
        )

    def save_trend(self, trend_data: dict):
        """Insert or update one trend keyed by ``id``.

        The whole ``trend_data`` dict is also stored as JSON in ``data``.
        """
        self._upsert(
            "trends",
            "id",
            (
                "id", "rank", "technique_name", "description", "emergence_score",
                "novelty_score", "impact_score", "mainstream_eta_months",
                "confidence", "data",
            ),
            (
                trend_data.get("id", ""),
                trend_data.get("rank", 0),
                trend_data.get("technique_name", ""),
                trend_data.get("description", ""),
                trend_data.get("emergence_score", 0),
                trend_data.get("novelty_score", 0),
                trend_data.get("impact_score", 0),
                trend_data.get("mainstream_eta_months", 12),
                trend_data.get("confidence", 0),
                json.dumps(trend_data),
            ),
            touch_updated_at=True,
        )

    def get_signals_count(self) -> int:
        """Return the total number of stored research signals."""
        cursor = self._execute("SELECT COUNT(*) FROM research_signals")
        return cursor.fetchone()[0]

    def get_signals_by_source(self, source: str) -> int:
        """Return how many stored signals have the given ``source``."""
        ph = self._placeholder()
        cursor = self._execute(
            f"SELECT COUNT(*) FROM research_signals WHERE source = {ph}",
            (source,),
        )
        return cursor.fetchone()[0]

    def save_feedback(self, feedback_data: dict):
        """Insert one feedback row (plain insert, not an upsert)."""
        ph = self._placeholder()
        self._execute(
            f"""INSERT INTO user_feedback (id, target_id, target_type, action)
                VALUES ({ph}, {ph}, {ph}, {ph})""",
            (
                feedback_data.get("id", ""),
                feedback_data.get("target_id", ""),
                feedback_data.get("target_type", ""),
                feedback_data.get("action", ""),
            ),
        )
        self._commit()

    # ── Pipelines and runs ──────────────────────────────────

    def save_pipeline(self, pipeline_data: dict):
        """Persist generated/updated pipeline snapshot."""
        self._upsert(
            "pipelines",
            "id",
            ("id", "technique_name", "task_type", "status", "data"),
            (
                pipeline_data.get("id", ""),
                pipeline_data.get("technique_name", ""),
                pipeline_data.get("task_type", ""),
                pipeline_data.get("status", "generated"),
                json.dumps(pipeline_data),
            ),
        )

    def save_pipeline_run(self, run_data: dict):
        """Persist pipeline run state."""
        self._upsert(
            "pipeline_runs",
            "run_id",
            ("run_id", "pipeline_id", "status", "data"),
            (
                run_data.get("run_id", ""),
                run_data.get("pipeline_id", ""),
                run_data.get("status", "queued"),
                json.dumps(run_data),
            ),
            touch_updated_at=True,
        )

    def get_pipeline_runs(self, pipeline_id: str) -> list[dict]:
        """Fetch all runs for a pipeline, newest first.

        Rows whose ``data`` column is not valid JSON are skipped, matching
        the other ``list_*`` readers.
        """
        ph = self._placeholder()
        cursor = self._execute(
            f"SELECT data FROM pipeline_runs WHERE pipeline_id = {ph} ORDER BY created_at DESC",
            (pipeline_id,),
        )
        return self._decode_json_rows(cursor.fetchall())

    def get_pipeline_run(self, run_id: str) -> Optional[dict]:
        """Fetch one pipeline run by id, or ``None`` when it does not exist."""
        ph = self._placeholder()
        cursor = self._execute(
            f"SELECT data FROM pipeline_runs WHERE run_id = {ph}",
            (run_id,),
        )
        row = cursor.fetchone()
        if not row:
            return None
        return json.loads(row[0])

    # ── Blueprints ──────────────────────────────────────────

    def save_blueprint(self, blueprint_data: dict):
        """Persist a generated blueprint."""
        self._upsert(
            "blueprints",
            "id",
            ("id", "technique_name", "trend_id", "data"),
            (
                blueprint_data.get("id", ""),
                blueprint_data.get("technique_name", ""),
                blueprint_data.get("trend_id", ""),
                json.dumps(blueprint_data),
            ),
        )

    def list_blueprints(self) -> list[dict]:
        """Return stored blueprints, newest first, skipping undecodable rows."""
        cursor = self._execute(
            "SELECT data FROM blueprints ORDER BY created_at DESC"
        )
        return self._decode_json_rows(cursor.fetchall())

    # ── Signals & Trends listing (for startup hydration) ────

    def list_signals(self, limit: int = 1000) -> list[dict]:
        """Return persisted research signals as plain dicts (no embedding).

        Used at startup to rehydrate the in-memory vector store. Embeddings
        are recomputed from ``raw_text`` because the column doesn't store
        them.
        """
        ph = self._placeholder()
        cursor = self._execute(
            f"""SELECT {', '.join(self._SIGNAL_COLUMNS)}
                FROM research_signals
                ORDER BY created_at DESC
                LIMIT {ph}""",
            (int(limit),),
        )
        out: list[dict] = []
        for row in cursor.fetchall():
            # Both sqlite3.Row and tuples iterate over column values in
            # SELECT order, so one zip covers both backends.
            record = dict(zip(self._SIGNAL_COLUMNS, row))
            record["authors"] = self._json_or_default(record.get("authors"), [])
            record["categories"] = self._json_or_default(record.get("categories"), [])
            record["metadata"] = self._json_or_default(record.get("metadata"), {})
            out.append(record)
        return out

    def list_trends(self) -> list[dict]:
        """Return persisted trends. Each row stores the full trend JSON in ``data``."""
        cursor = self._execute(
            "SELECT data FROM trends ORDER BY rank ASC"
        )
        return self._decode_json_rows(cursor.fetchall())

    def list_pipelines(self) -> list[dict]:
        """Return stored pipelines, newest first, skipping undecodable rows."""
        cursor = self._execute(
            "SELECT data FROM pipelines ORDER BY created_at DESC"
        )
        return self._decode_json_rows(cursor.fetchall())

    # ── Telegram subscribers ────────────────────────────────

    def ensure_telegram_subscribers_table(self):
        """Idempotent ensure of the subscribers table for older deployments."""
        cursor = self._conn.cursor()
        cursor.execute(self._TELEGRAM_SUBSCRIBERS_DDL)
        self._commit()

    def upsert_telegram_subscriber(self, chat_id: int, username: str = ""):
        """Add a subscriber, or refresh the username of an existing one."""
        self._upsert(
            "telegram_subscribers",
            "chat_id",
            ("chat_id", "username"),
            (int(chat_id), username or ""),
            touch_updated_at=True,
        )

    def delete_telegram_subscriber(self, chat_id: int):
        """Remove a subscriber; a missing ``chat_id`` is not an error."""
        ph = self._placeholder()
        self._execute(
            f"DELETE FROM telegram_subscribers WHERE chat_id = {ph}",
            (int(chat_id),),
        )
        self._commit()

    def list_telegram_subscriber_ids(self) -> list[int]:
        """Return subscriber chat ids, oldest subscription first."""
        cursor = self._execute("SELECT chat_id FROM telegram_subscribers ORDER BY created_at ASC")
        out = []
        for row in cursor.fetchall():
            try:
                out.append(int(row[0]))
            except (TypeError, ValueError):
                continue
        return out

    def close(self):
        """Close the connection, if one is open, and forget it."""
        if self._conn:
            self._conn.close()
            # Drop the handle so a later close() is a no-op.
            self._conn = None
|
delivery/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# VectorMinds Delivery Package
|
delivery/api_routes.py
ADDED
|
@@ -0,0 +1,781 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""REST API Routes — FastAPI endpoints for VectorMinds.
|
| 2 |
+
|
| 3 |
+
Provides full programmatic access to all platform capabilities:
|
| 4 |
+
trends, blueprints, pipelines, ingestion, stats, and vector map.
|
| 5 |
+
Includes WebSocket for real-time dashboard updates.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import asyncio
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
from datetime import datetime, timezone
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Query, HTTPException, Header
|
| 17 |
+
from pydantic import BaseModel
|
| 18 |
+
|
| 19 |
+
from agents.ingestion_agent import IngestionAgent
|
| 20 |
+
from agents.reasoning_agent import ReasoningAgent
|
| 21 |
+
from agents.memory_agent import MemoryAgent
|
| 22 |
+
from embeddings.engine import EmbeddingEngine
|
| 23 |
+
from embeddings.vector_store import VectorStore
|
| 24 |
+
from intelligence.blueprint_engine import BlueprintEngine
|
| 25 |
+
from intelligence.experiment_designer import ExperimentDesigner
|
| 26 |
+
from intelligence.pipeline_generator import PipelineGenerator
|
| 27 |
+
from intelligence.pipeline_executor import PipelineExecutor
|
| 28 |
+
from delivery.telegram_bot import TelegramBot
|
| 29 |
+
from db.database import Database
|
| 30 |
+
import config
|
| 31 |
+
|
| 32 |
+
logger = logging.getLogger("vectorminds.api")
|
| 33 |
+
|
| 34 |
+
router = APIRouter(prefix="/api")
|
| 35 |
+
|
| 36 |
+
# ─── Global instances (set by main.py on startup) ────────────
|
| 37 |
+
ingestion_agent: Optional[IngestionAgent] = None
|
| 38 |
+
reasoning_agent: Optional[ReasoningAgent] = None
|
| 39 |
+
memory_agent: Optional[MemoryAgent] = None
|
| 40 |
+
blueprint_engine: Optional[BlueprintEngine] = None
|
| 41 |
+
pipeline_generator: Optional[PipelineGenerator] = None
|
| 42 |
+
pipeline_executor: Optional[PipelineExecutor] = None
|
| 43 |
+
experiment_designer: Optional[ExperimentDesigner] = None
|
| 44 |
+
telegram_bot: Optional[TelegramBot] = None
|
| 45 |
+
embedding_engine: Optional[EmbeddingEngine] = None
|
| 46 |
+
vector_store: Optional[VectorStore] = None
|
| 47 |
+
database: Optional[Database] = None
|
| 48 |
+
|
| 49 |
+
# WebSocket connections for live updates
|
| 50 |
+
ws_connections: list[WebSocket] = []
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ─── Request / Response Models ────────────────────────────────
|
| 54 |
+
class IngestRequest(BaseModel):
    # Fields of an ingestion request. Described with comments rather than a
    # docstring so the generated schema is unchanged.

    # Which source to ingest: 'arxiv', 'github', or 'all'.
    source: str = "all"
    # Category to ingest, e.g. 'cs.LG'.
    category: Optional[str] = None
    # When True, kick the ingestion run off in the background and return
    # immediately with status="started". Lets mobile clients show a banner
    # without holding open a 60–120s HTTP request that arXiv often pushes
    # past their OkHttp read timeout.
    background: bool = False
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# In-memory tracker for the most recent ingestion run. Keeps the UI honest
|
| 65 |
+
# about what's happening on the server even when the call is fire-and-forget.
|
| 66 |
+
# Snapshot of the latest ingestion run; every counter starts at zero and
# every timestamp/error slot starts empty.
_ingestion_status: dict = dict(
    state="idle",  # 'idle' | 'running' | 'completed' | 'failed'
    started_at=None,
    finished_at=None,
    signals_ingested=0,
    trends_updated=0,
    error=None,
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class BlueprintRequest(BaseModel):
    # Fields of a blueprint request.

    # Required: id of the trend the request refers to.
    trend_id: str
    # Free-form extra text; empty when omitted.
    additional_context: str = ""
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class PipelineRequest(BaseModel):
    # Fields of a pipeline request.

    # Required technique name.
    technique_name: str
    # Defaults to an empty string when omitted.
    description: str = ""
    # None when the caller does not specify a task type.
    task_type: Optional[str] = None
|
| 85 |
+
|
| 86 |
+
class PipelineDatasetCandidatesRequest(BaseModel):
    # Same three fields as PipelineRequest plus a result-count cap.

    # Required technique name.
    technique_name: str
    # Defaults to an empty string when omitted.
    description: str = ""
    # None when the caller does not specify a task type.
    task_type: Optional[str] = None
    # Defaults to 8 (presumably the number of candidates wanted; confirm
    # against the route that consumes this model).
    top_k: int = 8
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class PipelineRunRequest(BaseModel):
    # Fields of a pipeline-run request.

    # Default is read from config.PIPELINE_RUN_TIMEOUT_SECONDS once, when
    # this class body is evaluated.
    timeout_seconds: int = config.PIPELINE_RUN_TIMEOUT_SECONDS
    # False unless the caller opts in.
    wait_for_completion: bool = False
|
| 96 |
+
|
| 97 |
+
class ExperimentDesignRequest(BaseModel):
    # Fields of an experiment-design request.

    # Required technique name.
    technique_name: str
    # Free-form text; empty when omitted.
    brief: str = ""
|
| 100 |
+
|
| 101 |
+
class DashboardPremiumContextResponse(BaseModel):
    # Response shape; all eight fields are required (none has a default).

    # Text fields.
    location: str
    focus: str
    next_meeting: str
    author_name: str

    # Numeric fields.
    papers_count: int
    confidence: float

    # Collection fields.
    reasoning_points: list[str]
    source_modes: dict
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class FeedbackRequest(BaseModel):
    """Request body for recording user feedback on a prediction."""

    # Identifier of the item being rated.
    target_id: str
    # Kind of item: 'trend' or 'blueprint'.
    target_type: str = "trend"
    # Vote direction: 'upvote' or 'downvote'.
    action: str = "upvote"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class SearchRequest(BaseModel):
    """Request body for semantic search over stored research signals."""

    # Text that is embedded and used as the search vector.
    query: str
    # Number of results requested.
    top_k: int = 10
    # Optional source filter forwarded to the vector store.
    source_filter: Optional[str] = None
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _assert_admin_api_key(x_api_key: Optional[str]):
    """Reject the request unless it carries the configured admin API key.

    When ``config.API_ADMIN_KEY`` is empty or unset the check is disabled and
    every caller is let through.

    Args:
        x_api_key: API key supplied by the caller, or ``None`` when absent.

    Raises:
        HTTPException: 401 when a key is configured and the supplied value
            does not match it.
    """
    import hmac  # function-scope import keeps this block self-contained

    expected = config.API_ADMIN_KEY
    if not expected:
        return

    # Compare in constant time so response latency does not leak how many
    # leading characters of the secret a guess got right. Encoding to bytes
    # lets compare_digest accept non-ASCII input; a missing header becomes an
    # empty value, which can never equal the (non-empty) configured key.
    supplied = (x_api_key or "").encode("utf-8")
    if not hmac.compare_digest(supplied, str(expected).encode("utf-8")):
        raise HTTPException(status_code=401, detail="Unauthorized")
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _apply_run_snapshot_to_pipeline(pipeline, run: dict):
|
| 132 |
+
if run["status"] in ("completed", "failed", "timeout"):
|
| 133 |
+
pipeline.status = run["status"]
|
| 134 |
+
else:
|
| 135 |
+
pipeline.status = "training"
|
| 136 |
+
metrics = dict(pipeline.metrics or {})
|
| 137 |
+
metrics["last_run"] = {
|
| 138 |
+
"run_id": run["run_id"],
|
| 139 |
+
"status": run["status"],
|
| 140 |
+
"started_at": run.get("started_at"),
|
| 141 |
+
"finished_at": run.get("finished_at"),
|
| 142 |
+
"exit_code": run.get("exit_code"),
|
| 143 |
+
"duration_seconds": run.get("duration_seconds"),
|
| 144 |
+
"log_path": run.get("log_path"),
|
| 145 |
+
"artifacts_dir": run.get("artifacts_dir"),
|
| 146 |
+
"retry_count": run.get("retry_count", 0),
|
| 147 |
+
"max_retries": run.get("max_retries", 0),
|
| 148 |
+
}
|
| 149 |
+
pipeline.metrics = metrics
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# ─── Broadcast helper ────────────────────────────────────────
|
| 153 |
+
async def broadcast_ws(event_type: str, data: dict):
    """Send real-time update to all connected WebSocket clients.

    The event is serialised once as JSON with ``type``, ``data`` and a UTC
    ISO-8601 ``timestamp``. Delivery is best-effort: a client whose send
    fails is dropped from ``ws_connections`` and the broadcast continues.

    Args:
        event_type: Event name placed in the message's ``type`` field.
        data: JSON-serialisable payload placed in the ``data`` field.
    """
    message = json.dumps({
        "type": event_type,
        "data": data,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    })

    # Iterate over a snapshot: every ``await`` below yields to the event loop,
    # and connection handlers may add or remove sockets in the meantime.
    # Mutating a list while iterating it would skip entries.
    for ws in list(ws_connections):
        try:
            await ws.send_text(message)
        except Exception:
            # Any send failure is treated as a dead connection. The socket's
            # own handler may already have removed it, so check membership
            # first instead of letting ``remove`` raise ValueError.
            if ws in ws_connections:
                ws_connections.remove(ws)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
# ─── Endpoints ────────────────────────────────────────────────
|
| 167 |
+
|
| 168 |
+
@router.get("/health")
async def health_check():
    """Report platform status with per-agent and infrastructure details.

    Only agents that are currently set contribute an entry to ``agents``;
    the vector-store count falls back to 0 when no store exists.
    """
    agent_slots = (
        ("ingestion", ingestion_agent),
        ("reasoning", reasoning_agent),
        ("memory", memory_agent),
    )
    agents_health = {
        label: agent.get_health() for label, agent in agent_slots if agent
    }

    checked_at = datetime.now(timezone.utc).isoformat()
    signal_count = vector_store.get_collection_count() if vector_store else 0

    # An empty QDRANT_HOST is reported as the in-memory vector backend.
    vector_backend = "qdrant_remote" if config.QDRANT_HOST else "qdrant_in_memory"
    infra = {
        "event_bus_backend": config.MESSAGE_BUS_BACKEND,
        "state_store_backend": config.STATE_STORE_BACKEND,
        "db_backend": config.DB_BACKEND,
        "vector_store_backend": vector_backend,
    }

    return {
        "status": "healthy",
        "timestamp": checked_at,
        "agents": agents_health,
        "vector_store_count": signal_count,
        "infra": infra,
    }
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
@router.get("/stats")
async def get_stats():
    """Assemble the headline numbers shown on the dashboard.

    Every backing service is optional: a missing one contributes a zero, an
    empty collection, or the status ``"offline"`` instead of an error.
    """
    if vector_store:
        signal_total = vector_store.get_collection_count()
    else:
        signal_total = 0

    if database:
        paper_total = database.get_signals_by_source("arxiv")
        repo_total = database.get_signals_by_source("github")
    else:
        paper_total = 0
        repo_total = 0

    # Novelty figures are computed from stored payloads (requested with limit=200).
    sampled_payloads = vector_store.get_all_payloads(limit=200) if vector_store else []
    novelty_scores = []
    for payload in sampled_payloads:
        novelty_scores.append(payload.get("novelty_score", 0))

    # max(..., 1) keeps the division defined when there are no scores.
    mean_novelty = sum(novelty_scores) / max(len(novelty_scores), 1)

    trend_total = len(reasoning_agent.trends) if reasoning_agent else 0
    blueprint_total = len(blueprint_engine.generated_blueprints) if blueprint_engine else 0
    pipeline_total = len(pipeline_generator.generated_pipelines) if pipeline_generator else 0

    agents_status = {
        "ingestion": ingestion_agent.status if ingestion_agent else "offline",
        "reasoning": reasoning_agent.status if reasoning_agent else "offline",
        "memory": memory_agent.status if memory_agent else "offline",
    }
    source_modes = {
        "patents_real": config.ENABLE_PATENTS_REAL,
        "startups_real": config.ENABLE_STARTUPS_REAL,
        "social_real": config.ENABLE_SOCIAL_REAL,
        "blog_real": config.ENABLE_BLOG_REAL,
        "allow_simulated_fallback": config.ALLOW_SIMULATED_SOURCES,
    }

    return {
        "total_signals": signal_total,
        "total_papers": paper_total,
        "total_github_repos": repo_total,
        "active_trends": trend_total,
        "blueprints_generated": blueprint_total,
        "pipelines_launched": pipeline_total,
        "avg_novelty_score": round(mean_novelty, 3),
        "novelty_distribution": novelty_scores[:100],
        "agents_status": agents_status,
        "source_modes": source_modes,
        "telegram": telegram_bot.get_stats() if telegram_bot else {},
        "last_updated": datetime.now(timezone.utc).isoformat(),
    }
|
| 228 |
+
|
| 229 |
+
@router.get("/dashboard/premium-context", response_model=DashboardPremiumContextResponse)
async def get_dashboard_premium_context():
    """Build the context shown in the premium dashboard panels.

    Uses the reasoning agent's cached trends (ranked by emergence score,
    highest first) when any exist, otherwise runs a fresh trend analysis.
    Placeholder values are used when there is no top trend.
    """
    ranked = []
    if reasoning_agent:
        if reasoning_agent.trends:
            ranked = list(reasoning_agent.trends.values())
            ranked.sort(key=lambda t: t.emergence_score, reverse=True)
        else:
            ranked = await reasoning_agent.analyze_trends()

    top = ranked[0] if ranked else None

    if top:
        technique = top.technique_name
        papers = top.paper_count
        confidence = float(top.confidence)
        horizon = "6" if top.mainstream_eta_months <= 6 else "12/24"
        stars = top.github_stars
    else:
        # No trend available: fall back to fixed placeholder values.
        technique = "Autonomous Research Discovery"
        papers = 0
        confidence = float(0.74)
        horizon = "12/24"
        stars = 0

    if config.STATE_STORE_BACKEND == "redis":
        location = "Distributed Lab (Cloud + Device)"
    else:
        location = "Local Research Runtime"

    next_meeting = f"Trend Review: {technique} ({horizon} month horizon)"
    reasoning_points = [
        f"Top ranked technique is '{technique}' from live trend analysis",
        f"Cross-source evidence includes {papers} papers and {stars} GitHub stars",
        f"Current backend mode: DB={config.DB_BACKEND}, Bus={config.MESSAGE_BUS_BACKEND}, State={config.STATE_STORE_BACKEND}",
    ]
    source_modes = {
        "patents_real": config.ENABLE_PATENTS_REAL,
        "startups_real": config.ENABLE_STARTUPS_REAL,
        "social_real": config.ENABLE_SOCIAL_REAL,
        "blog_real": config.ENABLE_BLOG_REAL,
        "allow_simulated_fallback": config.ALLOW_SIMULATED_SOURCES,
    }

    return DashboardPremiumContextResponse(
        location=location,
        focus=technique,
        next_meeting=next_meeting,
        author_name="Top Signal Cluster",
        papers_count=papers,
        confidence=confidence,
        reasoning_points=reasoning_points,
        source_modes=source_modes,
    )
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
async def _run_ingestion_pipeline(req: IngestRequest) -> dict:
    """Run one ingest → analyze → broadcast cycle and record its outcome.

    Works the same whether awaited inline or scheduled as a background task.
    The module-level ``_ingestion_status`` is replaced at the start
    ('running') and again at the end ('completed' or 'failed') so polling
    clients can follow the run.

    Args:
        req: Ingestion request; ``source`` and ``category`` are forwarded to
            the ingestion agent.

    Returns:
        ``{"status": "success", "signals_ingested": ..., "trends_updated": ...}``.

    Raises:
        Exception: Any failure is logged, recorded in ``_ingestion_status``
            and then re-raised unchanged.
    """
    global _ingestion_status
    _ingestion_status = dict(
        state="running",
        started_at=datetime.now(timezone.utc).isoformat(),
        finished_at=None,
        signals_ingested=0,
        trends_updated=0,
        error=None,
    )

    try:
        signals = await ingestion_agent.run_ingestion(
            source=req.source,
            category=req.category,
        )

        if database:
            for signal in signals:
                database.save_signal(signal.model_dump(mode="json"))

        trends_count = 0
        if reasoning_agent:
            trends = await reasoning_agent.analyze_trends()
            trends_count = len(reasoning_agent.trends)
            if database:
                for trend in trends:
                    database.save_trend(trend.model_dump(mode="json"))
            if telegram_bot:
                # Only the first three returned trends are alert candidates.
                for trend in trends[:3]:
                    if not (trend.impact_score >= config.IMPACT_HIGH_THRESHOLD):
                        continue
                    await telegram_bot.send_trend_alert(
                        technique=trend.technique_name,
                        score=trend.emergence_score,
                        eta=trend.mainstream_eta_months,
                    )

        # The broadcast carries a preview of at most 20 signals.
        preview = []
        for signal in signals[:20]:
            preview.append({
                "id": signal.id,
                "source": signal.source.value,
                "title": signal.title,
                "novelty_score": signal.novelty_score,
                "url": signal.url,
            })
        await broadcast_ws("ingestion_complete", {
            "count": len(signals),
            "source": req.source,
            "signals": preview,
        })

        if telegram_bot:
            arxiv_count = len([s for s in signals if s.source.value == "arxiv"])
            github_count = len([s for s in signals if s.source.value == "github"])
            await telegram_bot.send_ingestion_summary(arxiv_count, github_count)

        result = {
            "status": "success",
            "signals_ingested": len(signals),
            "trends_updated": trends_count,
        }
        _ingestion_status = dict(
            state="completed",
            started_at=_ingestion_status["started_at"],
            finished_at=datetime.now(timezone.utc).isoformat(),
            signals_ingested=len(signals),
            trends_updated=trends_count,
            error=None,
        )
        return result
    except Exception as e:
        logger.exception("Ingestion pipeline failed")
        _ingestion_status = dict(
            state="failed",
            started_at=_ingestion_status["started_at"],
            finished_at=datetime.now(timezone.utc).isoformat(),
            signals_ingested=0,
            trends_updated=0,
            error=str(e),
        )
        raise
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
# Strong references to in-flight background ingestion tasks. The event loop
# keeps only weak references to tasks, so a task nobody else references can
# be garbage-collected before it finishes.
_background_ingestion_tasks: set = set()


def _on_background_ingestion_done(task) -> None:
    """Release a finished background ingestion task and consume its result."""
    _background_ingestion_tasks.discard(task)
    if not task.cancelled():
        # The pipeline has already logged any failure and recorded it in
        # ``_ingestion_status``; retrieving the exception here only prevents
        # asyncio's "Task exception was never retrieved" warning.
        task.exception()


@router.post("/ingest")
async def trigger_ingestion(req: IngestRequest):
    """Trigger a manual ingestion run.

    With `background=True`, schedules the run on the event loop and returns
    immediately with status="started" so the mobile UI can show a banner and
    poll `/api/ingest/status` for completion. Otherwise runs inline (used by
    integration tests and curl smoke tests).

    Returns:
        For background requests, ``{"status": "already_running", ...}`` when
        a run is in flight, else ``{"status": "started", ...}``; for inline
        requests, the result dict of the finished run.

    Raises:
        HTTPException: 503 when the ingestion agent is not available.
    """
    if not ingestion_agent:
        raise HTTPException(status_code=503, detail="Ingestion agent not ready")

    if not req.background:
        return await _run_ingestion_pipeline(req)

    if _ingestion_status.get("state") == "running":
        return {
            "status": "already_running",
            "started_at": _ingestion_status.get("started_at"),
        }

    task = asyncio.create_task(_run_ingestion_pipeline(req))
    # Hold the task until it completes; the callback drops the reference.
    _background_ingestion_tasks.add(task)
    task.add_done_callback(_on_background_ingestion_done)
    return {
        "status": "started",
        "message": "Ingestion is running in the background. Poll /api/ingest/status.",
    }
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
@router.get("/ingest/status")
async def get_ingestion_status():
    """Expose the tracker for the most recent (or still running) ingestion."""
    current = _ingestion_status
    return current
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
@router.get("/trends")
async def get_trends(limit: int = Query(default=20, ge=1, le=100)):
    """Get the trend leaderboard.

    Cached trends are returned ranked by emergence score, highest first. If
    the reasoning agent has no trends yet, an analysis is run and its result
    is returned in the order the agent produced it.

    Args:
        limit: Maximum number of trends to return (1..100). The lower bound
            matters: a negative value would slice from the end of the list
            and silently return the wrong subset.

    Returns:
        ``{"trends": [...], "count": n}``; empty when no reasoning agent exists.
    """
    if not reasoning_agent:
        return {"trends": [], "count": 0}

    if reasoning_agent.trends:
        trends = sorted(
            reasoning_agent.trends.values(),
            key=lambda t: t.emergence_score,
            reverse=True,
        )
    else:
        # Run analysis if no trends exist yet
        trends = await reasoning_agent.analyze_trends()

    trend_list = [t.model_dump(mode="json") for t in trends[:limit]]
    return {"trends": trend_list, "count": len(trend_list)}
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
@router.get("/trends/{trend_id}")
async def get_trend_detail(
    trend_id: str,
    include_brief: bool = Query(
        default=False,
        description="When true, runs Gemini to generate technical_brief (slow). "
        "Omit or false for instant scores + description.",
    ),
):
    """Return one trend as JSON, optionally with a generated technical brief.

    The ``technical_brief`` key is always present; it is ``None`` unless
    ``include_brief`` is true.

    Raises:
        HTTPException: 503 without a reasoning agent, 404 for an unknown trend.
    """
    if not reasoning_agent:
        raise HTTPException(status_code=503, detail="Reasoning agent not ready")

    trend = reasoning_agent.trends.get(trend_id)
    if not trend:
        raise HTTPException(status_code=404, detail="Trend not found")

    payload = trend.model_dump(mode="json")

    technical_brief = None
    if include_brief:
        technical_brief = await reasoning_agent.generate_technical_brief(
            trend.technique_name,
            trend.description,
        )
    payload["technical_brief"] = technical_brief
    return payload
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
@router.post("/blueprints/generate")
async def generate_blueprint(req: BlueprintRequest):
    """Create a product blueprint for an existing trend and announce it.

    The blueprint is handed to the memory agent and the database when they
    are available; a database failure is logged and does not fail the request.

    Raises:
        HTTPException: 503 when the blueprint engine or reasoning agent is
            missing, 404 when the trend id is unknown.
    """
    if not (blueprint_engine and reasoning_agent):
        raise HTTPException(status_code=503, detail="Services not ready")

    trend = reasoning_agent.trends.get(req.trend_id)
    if not trend:
        raise HTTPException(status_code=404, detail="Trend not found")

    blueprint = await blueprint_engine.generate_blueprint(
        trend,
        req.additional_context,
    )

    # Hand a JSON dump to the memory agent.
    if memory_agent:
        memory_agent.store_blueprint(blueprint.id, blueprint.model_dump(mode="json"))

    # Persistence is best-effort: log and carry on.
    if database:
        try:
            database.save_blueprint(blueprint.model_dump(mode="json"))
        except Exception as e:
            logger.warning("blueprint persist failed: %s", e)

    announcement = {
        "id": blueprint.id,
        "technique": blueprint.technique_name,
    }
    await broadcast_ws("blueprint_generated", announcement)

    return blueprint.model_dump(mode="json")
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
@router.get("/blueprints")
async def list_blueprints():
    """Return every generated blueprint as JSON, with a count."""
    serialized = []
    if blueprint_engine:
        for blueprint in blueprint_engine.list_blueprints():
            serialized.append(blueprint.model_dump(mode="json"))
    return {"blueprints": serialized, "count": len(serialized)}
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
@router.get("/blueprints/{blueprint_id}")
async def get_blueprint(blueprint_id: str):
    """Return a single blueprint as JSON.

    Raises:
        HTTPException: 503 without a blueprint engine, 404 for an unknown id.
    """
    if not blueprint_engine:
        raise HTTPException(status_code=503, detail="Blueprint engine not ready")

    found = blueprint_engine.get_blueprint(blueprint_id)
    if found:
        return found.model_dump(mode="json")
    raise HTTPException(status_code=404, detail="Blueprint not found")
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
@router.post("/pipelines/generate")
async def generate_pipeline(req: PipelineRequest):
    """Generate an ML training pipeline and notify listeners.

    The new pipeline is stored with the memory agent and the database when
    present, broadcast over WebSocket, and reported to Telegram when a bot
    is configured.

    Raises:
        HTTPException: 503 when the pipeline generator is not available.
    """
    if not pipeline_generator:
        raise HTTPException(status_code=503, detail="Pipeline generator not ready")

    pipeline = pipeline_generator.generate_pipeline(
        technique_name=req.technique_name,
        description=req.description,
        task_type=req.task_type,
    )

    if memory_agent:
        memory_agent.store_pipeline(pipeline.model_dump(mode="json"))
    if database:
        database.save_pipeline(pipeline.model_dump(mode="json"))

    event = {
        "id": pipeline.id,
        "technique": pipeline.technique_name,
        "task_type": pipeline.task_type,
    }
    await broadcast_ws("pipeline_generated", event)

    if telegram_bot:
        await telegram_bot.send_pipeline_complete(
            technique=pipeline.technique_name,
            task_type=pipeline.task_type,
            metrics=pipeline.metrics,
            colab_url=pipeline.colab_url,
        )

    return pipeline.model_dump(mode="json")
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
@router.post("/pipelines/{pipeline_id}/run")
async def run_pipeline(pipeline_id: str, req: PipelineRunRequest, x_api_key: Optional[str] = Header(default=None)):
    """Execute a generated pipeline script.

    The pipeline is marked "training" and persisted before the executor is
    invoked. With ``wait_for_completion`` the run is awaited, otherwise the
    executor's async entry point is used and its snapshot is returned right
    away. The run snapshot is then applied to the pipeline, persisted and
    broadcast.

    Args:
        pipeline_id: Id of a previously generated pipeline.
        req: Run options; ``timeout_seconds`` is clamped to 30..7200.
        x_api_key: Admin API key header, checked by ``_assert_admin_api_key``.

    Returns:
        ``{"status": "accepted" | "finished", "pipeline": ..., "run": ...}``.

    Raises:
        HTTPException: 401 for a bad admin key, 503 when pipeline services
            are missing, 404 for an unknown pipeline.
    """
    _assert_admin_api_key(x_api_key)
    if not pipeline_generator or not pipeline_executor:
        raise HTTPException(status_code=503, detail="Pipeline services not ready")

    pipeline = pipeline_generator.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")

    timeout = min(max(req.timeout_seconds, 30), 7200)
    previous_status = pipeline.status
    pipeline.status = "training"
    pipeline_generator.update_pipeline(pipeline)
    if database:
        database.save_pipeline(pipeline.model_dump(mode="json"))

    try:
        if req.wait_for_completion:
            run = await pipeline_executor.execute_pipeline(pipeline, timeout_seconds=timeout)
        else:
            run = pipeline_executor.execute_pipeline_async(pipeline, timeout_seconds=timeout)
    except Exception:
        # The executor failed before producing a run snapshot. Roll the
        # status back so the pipeline is not left stuck in "training",
        # then let the original error propagate.
        pipeline.status = previous_status
        pipeline_generator.update_pipeline(pipeline)
        if database:
            database.save_pipeline(pipeline.model_dump(mode="json"))
        raise

    _apply_run_snapshot_to_pipeline(pipeline, run)
    pipeline_generator.update_pipeline(pipeline)
    if database:
        database.save_pipeline(pipeline.model_dump(mode="json"))
        database.save_pipeline_run(run)

    await broadcast_ws("pipeline_run_started", {
        "pipeline_id": pipeline.id,
        "run_id": run["run_id"],
        "status": run["status"],
        "technique": pipeline.technique_name,
    })

    return {
        "status": "accepted" if run["status"] in ("queued", "running") else "finished",
        "pipeline": pipeline.model_dump(mode="json"),
        "run": run,
    }
|
| 571 |
+
|
| 572 |
+
@router.post("/pipelines/dataset-candidates")
async def pipeline_dataset_candidates(req: PipelineDatasetCandidatesRequest):
    """Return ranked dataset candidates for a technique, before generation.

    ``top_k`` is clamped to the range 1..20 before it reaches the generator.

    Raises:
        HTTPException: 503 when the pipeline generator is not available.
    """
    if not pipeline_generator:
        raise HTTPException(status_code=503, detail="Pipeline generator not ready")

    bounded_top_k = min(max(req.top_k, 1), 20)
    candidates = pipeline_generator.dataset_candidates(
        technique_name=req.technique_name,
        description=req.description,
        task_type=req.task_type,
        top_k=bounded_top_k,
    )
    return {"candidates": candidates, "count": len(candidates)}
|
| 584 |
+
|
| 585 |
+
@router.post("/experiments/design")
async def design_experiment(req: ExperimentDesignRequest):
    """Ask the experiment designer for an experiment plan for a technique.

    Raises:
        HTTPException: 503 when the experiment designer is not available.
    """
    if not experiment_designer:
        raise HTTPException(status_code=503, detail="Experiment designer not ready")

    design = await experiment_designer.design_experiment(
        technique_name=req.technique_name,
        brief=req.brief,
    )
    return design
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
@router.get("/pipelines")
async def list_pipelines():
    """Return every generated pipeline as JSON, with a count."""
    serialized = []
    if pipeline_generator:
        for pipeline in pipeline_generator.list_pipelines():
            serialized.append(pipeline.model_dump(mode="json"))
    return {"pipelines": serialized, "count": len(serialized)}
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
@router.get("/pipelines/{pipeline_id}")
async def get_pipeline(pipeline_id: str):
    """Return a single pipeline as JSON.

    Raises:
        HTTPException: 503 without a pipeline generator, 404 for an unknown id.
    """
    if not pipeline_generator:
        raise HTTPException(status_code=503, detail="Pipeline generator not ready")

    found = pipeline_generator.get_pipeline(pipeline_id)
    if found:
        return found.model_dump(mode="json")
    raise HTTPException(status_code=404, detail="Pipeline not found")
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
@router.get("/pipelines/{pipeline_id}/runs")
async def list_pipeline_runs(pipeline_id: str):
    """List the runs recorded for a pipeline.

    The executor is asked first; when it reports no runs and a database is
    configured, the database's records are used instead.

    Raises:
        HTTPException: 503 when pipeline services are missing, 404 for an
            unknown pipeline.
    """
    if not (pipeline_generator and pipeline_executor):
        raise HTTPException(status_code=503, detail="Pipeline services not ready")

    if not pipeline_generator.get_pipeline(pipeline_id):
        raise HTTPException(status_code=404, detail="Pipeline not found")

    runs = pipeline_executor.list_runs(pipeline_id)
    if database and not runs:
        runs = database.get_pipeline_runs(pipeline_id)

    return {"pipeline_id": pipeline_id, "runs": runs, "count": len(runs)}
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
@router.get("/pipelines/{pipeline_id}/runs/{run_id}")
async def get_pipeline_run(pipeline_id: str, run_id: str):
    """Return the snapshot of one pipeline run and sync it onto the pipeline.

    The executor is consulted first, then the database. Note that this GET
    also writes: the snapshot is applied to the pipeline and both pipeline
    and run are persisted again.

    Raises:
        HTTPException: 503 when pipeline services are missing, 404 when the
            pipeline or the run cannot be found.
    """
    if not (pipeline_generator and pipeline_executor):
        raise HTTPException(status_code=503, detail="Pipeline services not ready")

    pipeline = pipeline_generator.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")

    snapshot = pipeline_executor.get_run(pipeline_id, run_id)
    if database and not snapshot:
        snapshot = database.get_pipeline_run(run_id)
    if not snapshot:
        raise HTTPException(status_code=404, detail="Run not found")

    _apply_run_snapshot_to_pipeline(pipeline, snapshot)
    pipeline_generator.update_pipeline(pipeline)
    if database:
        database.save_pipeline(pipeline.model_dump(mode="json"))
        database.save_pipeline_run(snapshot)

    return snapshot
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
@router.get("/pipelines/{pipeline_id}/runs/{run_id}/log")
async def get_pipeline_run_log(pipeline_id: str, run_id: str, tail_lines: int = Query(default=200, ge=10, le=2000)):
    """Read latest log lines for a pipeline run.

    Args:
        pipeline_id: Id of the pipeline the run belongs to.
        run_id: Id of the run whose log is requested.
        tail_lines: Number of trailing log lines to return (10..2000).

    Returns:
        Dict with ``pipeline_id``, ``run_id``, the run ``status``,
        ``line_count`` and the joined ``log_tail``. A run without a log path
        or with a missing log file yields an empty tail.

    Raises:
        HTTPException: 503 when pipeline services are missing, 404 when the
            pipeline or the run cannot be found.
    """
    from collections import deque  # function-scope import keeps the block self-contained

    if not pipeline_generator or not pipeline_executor:
        raise HTTPException(status_code=503, detail="Pipeline services not ready")
    pipeline = pipeline_generator.get_pipeline(pipeline_id)
    if not pipeline:
        raise HTTPException(status_code=404, detail="Pipeline not found")

    run = pipeline_executor.get_run(pipeline_id, run_id)
    if not run and database:
        # Same fallback as the run-status endpoint: runs the executor no
        # longer tracks may still be recorded in the database.
        run = database.get_pipeline_run(run_id)
    if not run:
        raise HTTPException(status_code=404, detail="Run not found")

    # ``log_path`` is optional on run snapshots, so do not index it blindly.
    log_path = run.get("log_path")
    sliced = []
    if log_path:
        try:
            # errors="replace": training scripts may emit bytes that are not
            # valid UTF-8; a log viewer should show them, not answer 500.
            with open(log_path, "r", encoding="utf-8", errors="replace") as f:
                # A bounded deque keeps only the last ``tail_lines`` lines in
                # memory instead of loading the whole file.
                sliced = list(deque(f, maxlen=tail_lines))
        except FileNotFoundError:
            sliced = []

    return {
        "pipeline_id": pipeline_id,
        "run_id": run_id,
        "status": run["status"],
        "line_count": len(sliced),
        "log_tail": "".join(sliced),
    }
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
@router.post("/search")
async def semantic_search(req: SearchRequest):
    """Semantic search across all research signals.

    The query text is embedded and matched against the vector store,
    optionally restricted by ``source_filter``.

    Returns:
        ``{"results": [...], "count": n, "query": <original query>}``.

    Raises:
        HTTPException: 503 when the embedding engine or vector store is missing.
    """
    if not embedding_engine or not vector_store:
        raise HTTPException(status_code=503, detail="Search not ready")

    # ``top_k`` is client-controlled: clamp it (as the dataset-candidates
    # endpoint does) so zero, negative or very large values never reach the
    # vector store. The upper bound of 100 is a defensive cap.
    top_k = min(max(req.top_k, 1), 100)

    query_embedding = embedding_engine.embed_text(req.query)
    results = vector_store.search(
        query_vector=query_embedding,
        top_k=top_k,
        source_filter=req.source_filter,
    )
    return {"results": results, "count": len(results), "query": req.query}
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
@router.get("/vector-map")
async def get_vector_map(limit: int = Query(default=200, ge=1, le=500)):
    """Get 2D projection of vector space for visualization.

    Stored vectors are reduced with PCA and returned together with a few
    payload fields per point.

    Args:
        limit: Maximum number of vectors to fetch for the projection (1..500).

    Returns:
        ``{"points": [...], "count": n, "explained_variance": [...]}``, or
        ``{"points": [], "count": 0}`` when there is no vector store or fewer
        than two vectors.
    """
    if not vector_store:
        return {"points": [], "count": 0}

    vectors, payloads = vector_store.get_vectors_for_projection(limit=limit)
    if not vectors:
        return {"points": [], "count": 0}

    # Simple 2D projection using PCA (fast for demo)
    import numpy as np
    from sklearn.decomposition import PCA

    vecs = np.array(vectors)
    if len(vecs) < 2:
        return {"points": [], "count": 0}

    # PCA cannot produce more components than samples or features.
    n_components = min(2, len(vecs), vecs.shape[1])
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(vecs)

    points = [
        {
            # With a single component there is no second coordinate: use 0.
            "x": float(coords[0]) if len(coords) > 0 else 0,
            "y": float(coords[1]) if len(coords) > 1 else 0,
            "title": payload.get("title", ""),
            "source": payload.get("source", ""),
            "novelty_score": payload.get("novelty_score", 0),
            "categories": payload.get("categories", []),
        }
        for coords, payload in zip(projected, payloads)
    ]

    # Zero-variance input (e.g. identical vectors) can make the variance
    # ratio NaN, which strict JSON encoding rejects; report 0.0 instead.
    explained_variance = np.nan_to_num(pca.explained_variance_ratio_).tolist()

    return {
        "points": points,
        "count": len(points),
        "explained_variance": explained_variance,
    }
|
| 736 |
+
|
| 737 |
+
|
| 738 |
+
@router.post("/feedback")
async def submit_feedback(req: FeedbackRequest):
    """Record an upvote/downvote and forward it on the message bus.

    A random UUID is assigned as the feedback id. The record is saved to the
    database and published on the ``delivery.feedback`` topic when those
    services are available.
    """
    import uuid

    record = dict(
        id=str(uuid.uuid4()),
        target_id=req.target_id,
        target_type=req.target_type,
        action=req.action,
    )

    if database:
        database.save_feedback(record)

    if memory_agent:
        await memory_agent.bus.publish_simple("delivery.feedback", "api", record)

    return {"status": "recorded", "feedback": record}
|
| 759 |
+
|
| 760 |
+
|
| 761 |
+
# ─── WebSocket for Live Updates ───────────────────────────────
|
| 762 |
+
@router.websocket("/ws/live")
async def websocket_live(ws: WebSocket):
    """WebSocket endpoint for real-time dashboard updates.

    The socket is registered in ``ws_connections`` so ``broadcast_ws`` can
    reach it. Incoming "ping" text frames are answered with a JSON "pong";
    other messages are read and ignored. The socket is always unregistered
    when the handler exits, however it exits.
    """
    await ws.accept()
    ws_connections.append(ws)
    logger.info(f"WebSocket client connected (total: {len(ws_connections)})")

    try:
        while True:
            # Keep connection alive, handle incoming messages
            data = await ws.receive_text()
            if data == "ping":
                await ws.send_text(json.dumps({"type": "pong"}))
    except WebSocketDisconnect:
        # Normal client-initiated close; cleanup happens in ``finally``.
        pass
    except Exception:
        # Previously swallowed silently; keep the handler quiet for the
        # client but leave a trace for operators.
        logger.exception("WebSocket connection failed unexpectedly")
    finally:
        # ``broadcast_ws`` may already have pruned this socket after a failed
        # send, so check membership instead of letting ``remove`` raise.
        # ``finally`` also covers task cancellation, which the ``except``
        # clauses above do not catch.
        if ws in ws_connections:
            ws_connections.remove(ws)
        logger.info(f"WebSocket client disconnected (total: {len(ws_connections)})")
|
delivery/colab_publisher.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Publish generated VectorMinds notebooks to a real Google Colab URL.
|
| 2 |
+
|
| 3 |
+
Strategy: create a GitHub Gist (public by default, using the existing
|
| 4 |
+
``GITHUB_TOKEN``) holding the ``.ipynb`` file. Colab can open any public Gist via
|
| 5 |
+
``https://colab.research.google.com/gist/<owner>/<gist_id>/<filename>.ipynb``.
|
| 6 |
+
|
| 7 |
+
If no token is available or GitHub is unreachable, ``publish_notebook`` returns
|
| 8 |
+
``None`` so the pipeline still works with an in-memory notebook.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
import httpx
|
| 17 |
+
|
| 18 |
+
import config
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger("vectorminds.colab")
|
| 21 |
+
|
| 22 |
+
GITHUB_API = "https://api.github.com"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _auth_headers() -> dict:
    """Build the HTTP headers for GitHub API calls.

    The ``Authorization`` bearer header is included only when
    ``config.GITHUB_TOKEN`` is set.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "VectorMinds/1.0",
    }
    token = config.GITHUB_TOKEN
    if token:
        headers["Authorization"] = f"Bearer {token}"
    return headers
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _resolve_owner() -> Optional[str]:
    """Look up the GitHub login that owns the configured ``GITHUB_TOKEN``.

    Returns:
        The ``login`` field of ``GET /user``, or ``None`` when no token is
        configured, the API does not answer 200, or the request fails.
    """
    if not config.GITHUB_TOKEN:
        return None

    try:
        with httpx.Client(timeout=10.0, headers=_auth_headers()) as client:
            response = client.get(f"{GITHUB_API}/user")
        if response.status_code == 200:
            profile = response.json() or {}
            return profile.get("login")
        logger.warning("GitHub /user returned %s: %s", response.status_code, response.text[:200])
        return None
    except Exception as e:
        # Best-effort lookup: any failure is logged and reported as "unknown".
        logger.warning("GitHub /user failed: %s", e)
        return None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def publish_notebook(
    notebook_payload: dict,
    filename: str,
    description: str = "",
    public: bool = True,
) -> Optional[dict]:
    """Create a Gist holding a single ``.ipynb`` and return ``{owner, gist_id, colab_url, gist_url}``.

    Args:
        notebook_payload: Notebook content; serialised to JSON as the file body.
        filename: Name of the file inside the Gist (also returned as ``filename``).
        description: Gist description; a default text is used when empty.
        public: Whether the Gist is created as public.

    Returns:
        Dict with ``owner``, ``gist_id``, ``gist_url``, ``colab_url`` and
        ``filename``, or ``None`` if publishing is not possible (no token,
        network failure, non-2xx response, or a response without a Gist id).
    """
    if not config.GITHUB_TOKEN:
        logger.info("GitHub token not configured; skipping Colab gist publish.")
        return None
    import json
    from urllib.parse import quote

    body = {
        "description": description or "VectorMinds generated training pipeline",
        "public": bool(public),
        "files": {filename: {"content": json.dumps(notebook_payload, ensure_ascii=False, indent=2)}},
    }
    try:
        with httpx.Client(timeout=20.0, headers=_auth_headers()) as client:
            r = client.post(f"{GITHUB_API}/gists", json=body)
        if r.status_code not in (200, 201):
            logger.warning("Gist create failed %s: %s", r.status_code, r.text[:300])
            return None
        data = r.json()
    except Exception as e:
        logger.warning("Gist create exception: %s", e)
        return None

    # Validate the response before any further network call: without a Gist
    # id there is nothing to link to, so the owner lookup would be wasted.
    if not isinstance(data, dict):
        logger.warning("Gist create returned an unexpected body of type %s", type(data).__name__)
        return None
    gist_id = data.get("id")
    if not gist_id:
        return None

    # Resolve the owner at most once; the fallback lookup is a network call
    # with its own timeout, so retrying it after a failure only adds latency.
    owner_login = (data.get("owner") or {}).get("login") or _resolve_owner()
    gist_url = data.get("html_url") or (
        f"https://gist.github.com/{owner_login}/{gist_id}" if owner_login else ""
    )
    if not owner_login:
        owner_login = "anonymous"

    # Percent-encode the filename so spaces or other reserved characters do
    # not produce a broken Colab link.
    colab_url = (
        f"https://colab.research.google.com/gist/{owner_login}/{gist_id}/{quote(filename)}"
    )
    logger.info("Published Colab gist %s for %s", gist_id, filename)
    return {
        "owner": owner_login,
        "gist_id": gist_id,
        "gist_url": gist_url,
        "colab_url": colab_url,
        "filename": filename,
    }
|
delivery/telegram_bot.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Production Telegram bot for VectorMinds.
|
| 2 |
+
|
| 3 |
+
Real two-way bot built on ``python-telegram-bot`` 21:
|
| 4 |
+
- subscribers register with ``/start`` (chat ids persisted in Postgres/SQLite)
|
| 5 |
+
- alerts (trend, ingestion summary, pipeline complete) broadcast to every subscriber
|
| 6 |
+
- live commands: ``/start /help /status /trends /pipelines /unsubscribe``
|
| 7 |
+
|
| 8 |
+
The bot has no mock fallback: when ``TELEGRAM_BOT_TOKEN`` is unset the module logs a
|
| 9 |
+
clear warning and ``send_*`` methods become no-ops returning ``False``. When a token
|
| 10 |
+
is set, every send is a real Telegram API call and failures are returned as ``False``.
|
| 11 |
+
|
| 12 |
+
The bot lifecycle (``start_polling`` / ``stop_polling``) is managed from ``main.lifespan`` so
|
| 13 |
+
polling runs alongside FastAPI without blocking the event loop.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import asyncio
|
| 19 |
+
import logging
|
| 20 |
+
from datetime import datetime, timezone
|
| 21 |
+
from typing import Awaitable, Callable, Optional
|
| 22 |
+
|
| 23 |
+
from telegram import Update
|
| 24 |
+
from telegram.constants import ParseMode
|
| 25 |
+
from telegram.error import TelegramError
|
| 26 |
+
from telegram.ext import (
|
| 27 |
+
Application,
|
| 28 |
+
ApplicationBuilder,
|
| 29 |
+
CommandHandler,
|
| 30 |
+
ContextTypes,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
import config
|
| 34 |
+
|
| 35 |
+
logger = logging.getLogger("vectorminds.telegram")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
WELCOME = (
|
| 39 |
+
"<b>VectorMinds</b> - GenAI Research Intelligence\n"
|
| 40 |
+
"You are subscribed to live alerts.\n\n"
|
| 41 |
+
"Commands:\n"
|
| 42 |
+
"/help - show this help\n"
|
| 43 |
+
"/status - platform stats\n"
|
| 44 |
+
"/trends - top emerging techniques\n"
|
| 45 |
+
"/pipelines - recent ML pipelines\n"
|
| 46 |
+
"/unsubscribe - stop receiving alerts"
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
HELP_MESSAGE = (
|
| 50 |
+
"<b>VectorMinds Bot</b>\n"
|
| 51 |
+
"/start - subscribe and show this menu\n"
|
| 52 |
+
"/help - show this help\n"
|
| 53 |
+
"/status - platform stats (signals, trends, pipelines)\n"
|
| 54 |
+
"/trends - top 5 ranked techniques\n"
|
| 55 |
+
"/pipelines - recent ML pipelines\n"
|
| 56 |
+
"/unsubscribe - stop receiving alerts"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class TelegramBot:
    """Production Telegram delivery and command surface for VectorMinds.

    The bot is enabled only when a token is available (constructor argument or
    ``config.TELEGRAM_BOT_TOKEN``). When disabled, outbound ``send_*`` calls
    return ``False`` and ``broadcast`` returns ``0``.

    All dynamic values interpolated into HTML-formatted messages are escaped,
    because Telegram rejects HTML messages containing stray ``<``/``&``.
    """

    def __init__(self, token: str = "", database=None):
        self.token = (token or config.TELEGRAM_BOT_TOKEN or "").strip()
        self.database = database
        self.enabled: bool = bool(self.token)
        self._sent_count = 0
        self._failed_count = 0
        self._app: Optional[Application] = None
        self._polling_started = False
        # Optional callable returning a fresh ``stats`` dict for ``/status``.
        self._stats_provider: Optional[Callable[[], Awaitable[dict]]] = None
        # Optional callable returning ``list[dict]`` of trends for ``/trends``.
        self._trends_provider: Optional[Callable[[int], Awaitable[list[dict]]]] = None
        # Optional callable returning ``list[dict]`` of pipelines for ``/pipelines``.
        self._pipelines_provider: Optional[Callable[[int], Awaitable[list[dict]]]] = None
        if not self.enabled:
            logger.warning(
                "Telegram bot disabled - set TELEGRAM_BOT_TOKEN to enable real delivery"
            )

    # ── formatting helpers ────────────────────────────────────

    @staticmethod
    def _esc(value: object) -> str:
        """Return ``str(value)`` escaped for interpolation into an HTML message."""
        from html import escape  # local import keeps the class self-contained

        return escape(str(value), quote=True)

    @staticmethod
    def _as_float(value: object, default: float = 0.0) -> float:
        """Convert *value* to ``float``, returning *default* for ``None``/non-numeric input."""
        try:
            return float(value)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            return default

    # ── lifecycle ─────────────────────────────────────────────

    async def start_polling(self) -> None:
        """Start the long-polling task. Safe to call once; idempotent on retry.

        On any start-up failure the partially started application is stopped
        and shut down, and the bot is disabled.
        """
        if not self.enabled or self._polling_started:
            return
        try:
            self._app = (
                ApplicationBuilder()
                .token(self.token)
                .concurrent_updates(True)
                .build()
            )
            self._app.add_handler(CommandHandler("start", self._cmd_start))
            self._app.add_handler(CommandHandler("help", self._cmd_help))
            self._app.add_handler(CommandHandler("status", self._cmd_status))
            self._app.add_handler(CommandHandler("trends", self._cmd_trends))
            self._app.add_handler(CommandHandler("pipelines", self._cmd_pipelines))
            self._app.add_handler(CommandHandler("unsubscribe", self._cmd_unsubscribe))

            await self._app.initialize()
            await self._app.start()
            await self._app.updater.start_polling(drop_pending_updates=False)
            self._polling_started = True
            me = await self._app.bot.get_me()
            count = self._subscriber_count()
            logger.info(
                "Telegram bot @%s online (subscribers=%s)", me.username, count
            )
        except Exception as e:
            logger.error("Telegram polling failed to start: %s", e)
            # Release whatever was started (updater / application) instead of
            # just dropping the reference; stop_polling also resets our state.
            await self.stop_polling()
            self.enabled = False

    async def stop_polling(self) -> None:
        """Cleanly stop the polling task. Safe to call multiple times."""
        app = self._app
        if not app:
            return
        try:
            if app.updater and app.updater.running:
                await app.updater.stop()
            if app.running:
                await app.stop()
            await app.shutdown()
        except Exception as e:
            logger.warning("Telegram bot stop encountered: %s", e)
        finally:
            self._app = None
            self._polling_started = False

    def attach_providers(
        self,
        stats: Optional[Callable[[], Awaitable[dict]]] = None,
        trends: Optional[Callable[[int], Awaitable[list[dict]]]] = None,
        pipelines: Optional[Callable[[int], Awaitable[list[dict]]]] = None,
    ) -> None:
        """Wire callables that resolve dynamic data for ``/status``, ``/trends`` and ``/pipelines``.

        Arguments left as ``None`` keep the previously attached provider.
        """
        if stats is not None:
            self._stats_provider = stats
        if trends is not None:
            self._trends_provider = trends
        if pipelines is not None:
            self._pipelines_provider = pipelines

    # ── command handlers ─────────────────────────────────────

    async def _cmd_start(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
        """``/start``: persist the chat as a subscriber and send the welcome menu."""
        chat = update.effective_chat
        user = update.effective_user
        if not chat:
            return
        self._upsert_subscriber(chat.id, user.username if user else None)
        await context.bot.send_message(
            chat_id=chat.id, text=WELCOME, parse_mode=ParseMode.HTML
        )

    async def _cmd_help(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
        """``/help``: send the command reference."""
        chat = update.effective_chat
        if not chat:
            return
        await context.bot.send_message(
            chat_id=chat.id, text=HELP_MESSAGE, parse_mode=ParseMode.HTML
        )

    async def _cmd_status(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
        """``/status``: render the dict returned by the stats provider."""
        chat = update.effective_chat
        if not chat:
            return
        if not self._stats_provider:
            await context.bot.send_message(
                chat_id=chat.id, text="Stats not available right now."
            )
            return
        try:
            data = await self._stats_provider()
        except Exception as e:
            logger.warning("status provider failed: %s", e)
            await context.bot.send_message(chat_id=chat.id, text="Stats are temporarily unavailable.")
            return
        data = data or {}  # tolerate a provider that returns None
        agents = data.get("agents_status") or {}
        esc = self._esc
        text = (
            "<b>VectorMinds status</b>\n"
            f"Total signals: <b>{esc(data.get('total_signals', 0))}</b>\n"
            f"Active trends: <b>{esc(data.get('active_trends', 0))}</b>\n"
            f"Blueprints: <b>{esc(data.get('blueprints_generated', 0))}</b>\n"
            f"Pipelines: <b>{esc(data.get('pipelines_launched', 0))}</b>\n"
            f"Avg novelty: <b>{esc(data.get('avg_novelty_score', 0))}</b>\n"
            f"Agents: ingestion=<b>{esc(agents.get('ingestion', '?'))}</b> "
            f"reasoning=<b>{esc(agents.get('reasoning', '?'))}</b> "
            f"memory=<b>{esc(agents.get('memory', '?'))}</b>"
        )
        await context.bot.send_message(chat_id=chat.id, text=text, parse_mode=ParseMode.HTML)

    async def _cmd_trends(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
        """``/trends``: list up to five trends from the trends provider."""
        chat = update.effective_chat
        if not chat:
            return
        if not self._trends_provider:
            await context.bot.send_message(chat_id=chat.id, text="Trend service is offline.")
            return
        try:
            trends = await self._trends_provider(5)
        except Exception as e:
            logger.warning("trend provider failed: %s", e)
            await context.bot.send_message(chat_id=chat.id, text="Trends temporarily unavailable.")
            return
        if not trends:
            await context.bot.send_message(
                chat_id=chat.id,
                text="No trends yet. Trigger an ingestion run from the API and try again.",
            )
            return
        lines = ["<b>Top Trends</b>"]
        for i, t in enumerate(trends, start=1):
            # A missing/None/non-numeric score is shown as 0.00 rather than
            # crashing the handler.
            score = self._as_float(t.get("emergence_score", 0))
            lines.append(
                f"{i}. <b>{self._esc(t.get('technique_name', '?'))}</b> | "
                f"emergence={score:.2f} | "
                f"ETA {self._esc(t.get('mainstream_eta_months', '?'))}mo"
            )
        await context.bot.send_message(
            chat_id=chat.id, text="\n".join(lines), parse_mode=ParseMode.HTML
        )

    async def _cmd_pipelines(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
        """``/pipelines``: list up to five pipelines from the pipelines provider."""
        chat = update.effective_chat
        if not chat:
            return
        if not self._pipelines_provider:
            await context.bot.send_message(chat_id=chat.id, text="Pipelines service is offline.")
            return
        try:
            pipelines = await self._pipelines_provider(5)
        except Exception as e:
            logger.warning("pipeline provider failed: %s", e)
            await context.bot.send_message(chat_id=chat.id, text="Pipelines temporarily unavailable.")
            return
        if not pipelines:
            await context.bot.send_message(
                chat_id=chat.id,
                text="No pipelines generated yet. Use /api/pipelines/generate.",
            )
            return
        lines = ["<b>Recent Pipelines</b>"]
        for i, p in enumerate(pipelines, start=1):
            colab = str(p.get("colab_url") or "")
            line = (
                f"{i}. <b>{self._esc(p.get('technique_name', '?'))}</b> "
                f"({self._esc(p.get('task_type', '?'))}, {self._esc(p.get('status', '?'))})"
            )
            # Only link https URLs; the href is escaped so '&' or quotes in
            # the URL cannot break the markup.
            if colab.startswith("https://"):
                line += f"\n   <a href=\"{self._esc(colab)}\">Open in Colab</a>"
            lines.append(line)
        await context.bot.send_message(
            chat_id=chat.id,
            text="\n".join(lines),
            parse_mode=ParseMode.HTML,
            disable_web_page_preview=True,
        )

    async def _cmd_unsubscribe(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
        """``/unsubscribe``: remove the chat from the subscriber list and confirm."""
        chat = update.effective_chat
        if not chat:
            return
        self._delete_subscriber(chat.id)
        await context.bot.send_message(
            chat_id=chat.id,
            text="You will no longer receive VectorMinds alerts. /start any time to subscribe again.",
        )

    # ── outbound delivery ─────────────────────────────────────

    async def _send_to(
        self, chat_id: int | str, text: str, parse_mode: Optional[str] = "HTML"
    ) -> bool:
        """Send *text* to one chat; return ``True`` on success, ``False`` on failure.

        A ``TelegramError`` is logged and counted in ``messages_failed``.
        """
        if not self.enabled or not self._app:
            return False
        try:
            await self._app.bot.send_message(
                chat_id=chat_id,
                text=text,
                parse_mode=parse_mode,
                disable_web_page_preview=True,
            )
            self._sent_count += 1
            return True
        except TelegramError as e:
            logger.warning("Telegram send to %s failed: %s", chat_id, e)
            self._failed_count += 1
            return False

    async def broadcast(self, text: str, parse_mode: Optional[str] = "HTML") -> int:
        """Send to every subscriber. Returns number of successful deliveries.

        Sends run concurrently. An unexpected exception from one send is logged
        and counted as a failure; it does not abort the remaining deliveries.
        """
        if not self.enabled or not self._app:
            return 0
        chat_ids = self._list_subscriber_ids()
        if not chat_ids:
            logger.info("Telegram broadcast skipped (no subscribers).")
            return 0
        results = await asyncio.gather(
            *(self._send_to(cid, text, parse_mode) for cid in chat_ids),
            return_exceptions=True,
        )
        sent = 0
        for cid, outcome in zip(chat_ids, results):
            if outcome is True:
                sent += 1
            elif isinstance(outcome, BaseException):
                self._failed_count += 1
                logger.warning("Telegram send to %s raised: %s", cid, outcome)
        return sent

    async def send_message(self, text: str, parse_mode: str = "HTML") -> bool:
        """Backward-compatible single-call entrypoint that broadcasts to all subscribers.

        Returns ``True`` when at least one subscriber received the message.
        """
        sent = await self.broadcast(text, parse_mode)
        return sent > 0

    async def send_trend_alert(self, technique: str, score: float, eta: int) -> bool:
        """Broadcast a "new high-impact trend" alert."""
        msg = (
            "<b>VectorMinds - New High-Impact Trend</b>\n\n"
            f"<b>Technique:</b> {self._esc(technique)}\n"
            f"<b>Emergence:</b> {score:.2f}\n"
            f"<b>Mainstream ETA:</b> {self._esc(eta)} months\n\n"
            "View in app or /trends"
        )
        return await self.send_message(msg)

    async def send_pipeline_complete(
        self, technique: str, task_type: str, metrics: dict, colab_url: str = ""
    ) -> bool:
        """Broadcast a "pipeline ready" alert; nested dict metrics are omitted."""
        metrics_str = ", ".join(f"{k}: {v}" for k, v in (metrics or {}).items() if not isinstance(v, dict))
        msg = (
            "<b>VectorMinds - Training Pipeline Ready</b>\n\n"
            f"<b>Technique:</b> {self._esc(technique)}\n"
            f"<b>Task:</b> {self._esc(task_type)}\n"
            f"<b>Highlights:</b> {self._esc(metrics_str)}"
        )
        if colab_url:
            msg += f"\n\n<a href=\"{self._esc(colab_url)}\">Open in Colab</a>"
        return await self.send_message(msg)

    async def send_ingestion_summary(self, paper_count: int, repo_count: int) -> bool:
        """Broadcast an ingestion summary with paper, repo and combined counts."""
        msg = (
            "<b>VectorMinds - Ingestion Complete</b>\n\n"
            f"<b>New Papers:</b> {paper_count}\n"
            f"<b>New Repos:</b> {repo_count}\n"
            f"<b>Total Signals:</b> {paper_count + repo_count}"
        )
        return await self.send_message(msg)

    # ── subscriber persistence ────────────────────────────────

    def _ensure_table(self) -> bool:
        """Return ``True`` when a database is attached and its subscriber table is ready."""
        if not self.database:
            return False
        try:
            self.database.ensure_telegram_subscribers_table()
            return True
        except Exception as e:
            logger.warning("Telegram subscriber table not available: %s", e)
            return False

    def _upsert_subscriber(self, chat_id: int, username: Optional[str]) -> None:
        """Persist a subscriber; failures are logged, never raised."""
        if not self._ensure_table():
            return
        try:
            self.database.upsert_telegram_subscriber(int(chat_id), username or "")
            logger.info("Telegram subscriber added: %s", chat_id)
        except Exception as e:
            logger.warning("Failed to persist subscriber %s: %s", chat_id, e)

    def _delete_subscriber(self, chat_id: int) -> None:
        """Remove a subscriber; failures are logged, never raised."""
        if not self._ensure_table():
            return
        try:
            self.database.delete_telegram_subscriber(int(chat_id))
            logger.info("Telegram subscriber removed: %s", chat_id)
        except Exception as e:
            logger.warning("Failed to delete subscriber %s: %s", chat_id, e)

    def _list_subscriber_ids(self) -> list[int]:
        """Return all subscriber chat ids, or ``[]`` when unavailable."""
        if not self._ensure_table():
            return []
        try:
            return list(self.database.list_telegram_subscriber_ids())
        except Exception as e:
            logger.warning("Failed to list subscribers: %s", e)
            return []

    def _subscriber_count(self) -> int:
        """Return the number of persisted subscribers."""
        return len(self._list_subscriber_ids())

    # ── stats ────────────────────────────────────────────────

    def get_stats(self) -> dict:
        """Return a snapshot of bot state and delivery counters (``now`` is UTC ISO-8601)."""
        return {
            "enabled": self.enabled,
            "polling": self._polling_started,
            "subscribers": self._subscriber_count(),
            "messages_sent": self._sent_count,
            "messages_failed": self._failed_count,
            "now": datetime.now(timezone.utc).isoformat(),
        }
|
embeddings/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# VectorMinds Embeddings Package
|
embeddings/engine.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Vector Embedding Engine — Semantic core of VectorMinds.
|
| 2 |
+
|
| 3 |
+
Handles hierarchical chunking, contrastive embeddings via BGE,
|
| 4 |
+
semantic deduplication, and batch encoding.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
from sentence_transformers import SentenceTransformer
|
| 14 |
+
|
| 15 |
+
import config
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger("vectorminds.embedding")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class EmbeddingEngine:
    """Manages text embedding using a sentence-transformer model.

    The model named by ``config.EMBEDDING_MODEL`` is loaded lazily on first
    access to :attr:`model`. ``get_instance`` returns a process-wide shared
    engine.
    """

    _instance: Optional["EmbeddingEngine"] = None
    _model: Optional[SentenceTransformer] = None

    @classmethod
    def get_instance(cls) -> "EmbeddingEngine":
        """Return the shared engine, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self._model = None

    def load_model(self):
        """Load the embedding model (lazy initialization)."""
        if self._model is None:
            logger.info(f"Loading embedding model: {config.EMBEDDING_MODEL}")
            self._model = SentenceTransformer(config.EMBEDDING_MODEL)
            logger.info("Embedding model loaded successfully")

    @property
    def model(self) -> SentenceTransformer:
        """The loaded model; triggers :meth:`load_model` on first access."""
        if self._model is None:
            self.load_model()
        return self._model

    def embed_text(self, text: str) -> list[float]:
        """Embed a single text string.

        Args:
            text: Input text to embed

        Returns:
            Normalized embedding vector as list of floats
        """
        embedding = self.model.encode(text, normalize_embeddings=True)
        return embedding.tolist()

    def embed_batch(self, texts: list[str], batch_size: int = 32) -> list[list[float]]:
        """Embed a batch of texts efficiently.

        Args:
            texts: List of input texts
            batch_size: Encoding batch size

        Returns:
            List of normalized embedding vectors (empty list for empty input)
        """
        if not texts:
            return []

        logger.info(f"Embedding batch of {len(texts)} texts")
        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embeddings.tolist()

    @staticmethod
    def _split_oversized(sentence: str, max_chunk_size: int) -> list[str]:
        """Cut a sentence longer than *max_chunk_size* into fixed-size pieces."""
        if max_chunk_size <= 0 or len(sentence) <= max_chunk_size:
            return [sentence]
        return [
            sentence[start:start + max_chunk_size]
            for start in range(0, len(sentence), max_chunk_size)
        ]

    def chunk_text(self, text: str, max_chunk_size: int = 512) -> list[str]:
        """Split text into sentence-aligned chunks of bounded size.

        Sentences (split on ``". "`` and newlines) are greedily packed into
        chunks joined by single spaces. Chunks do not overlap. A single
        sentence longer than ``max_chunk_size`` is cut into fixed-size pieces
        so that no chunk exceeds the limit.

        Args:
            text: Input text to chunk
            max_chunk_size: Maximum characters per chunk

        Returns:
            List of text chunks (``[text]`` when the text already fits)
        """
        if len(text) <= max_chunk_size:
            return [text]

        # Split by sentences first
        sentences = text.replace(". ", ".\n").split("\n")
        sentences = [s.strip() for s in sentences if s.strip()]

        chunks: list[str] = []
        current_chunk = ""

        for sentence in sentences:
            for piece in self._split_oversized(sentence, max_chunk_size):
                # +1 accounts for the joining space.
                if len(current_chunk) + len(piece) + 1 <= max_chunk_size:
                    current_chunk += (" " if current_chunk else "") + piece
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = piece

        if current_chunk:
            chunks.append(current_chunk)

        return chunks if chunks else [text]

    def compute_similarity(self, vec_a: list[float], vec_b: list[float]) -> float:
        """Compute cosine similarity between two vectors.

        A small epsilon in the denominator avoids division by zero for
        zero-length vectors.
        """
        a = np.array(vec_a)
        b = np.array(vec_b)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

    def is_duplicate(
        self,
        embedding: list[float],
        existing_embeddings: list[list[float]],
        threshold: Optional[float] = None,
    ) -> bool:
        """Check if an embedding is a semantic duplicate of any existing embedding.

        Args:
            embedding: New embedding to check
            existing_embeddings: List of existing embeddings
            threshold: Cosine-similarity threshold (default from
                ``config.DEDUP_SIMILARITY_THRESHOLD``)

        Returns:
            True if any existing embedding has similarity >= threshold
        """
        if threshold is None:
            threshold = config.DEDUP_SIMILARITY_THRESHOLD

        if not existing_embeddings:
            return False

        new_vec = np.array(embedding)
        # One matrix-vector product instead of a Python loop over rows.
        matrix = np.asarray(existing_embeddings)
        sims = (matrix @ new_vec) / (
            np.linalg.norm(new_vec) * np.linalg.norm(matrix, axis=1) + 1e-8
        )
        return bool(np.any(sims >= threshold))
|
embeddings/vector_store.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Qdrant Vector Store — Semantic search and storage layer.
|
| 2 |
+
|
| 3 |
+
Wraps Qdrant client in in-memory mode for the hackathon MVP.
|
| 4 |
+
Handles collection management, upsert, k-NN search, and novelty computation.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
from qdrant_client import QdrantClient
|
| 15 |
+
from qdrant_client.models import (
|
| 16 |
+
Distance,
|
| 17 |
+
PointStruct,
|
| 18 |
+
VectorParams,
|
| 19 |
+
Filter,
|
| 20 |
+
FieldCondition,
|
| 21 |
+
MatchValue,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
import config
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger("vectormind.vectorstore")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class VectorStore:
|
| 30 |
+
"""Qdrant-based vector store for research signals."""
|
| 31 |
+
|
| 32 |
+
_instance: Optional["VectorStore"] = None
|
| 33 |
+
|
| 34 |
+
@classmethod
|
| 35 |
+
def get_instance(cls) -> "VectorStore":
    """Return the shared store, constructing it on the first call."""
    shared = cls._instance
    if shared is None:
        shared = cls()
        cls._instance = shared
    return shared
|
| 39 |
+
|
| 40 |
+
def __init__(self):
    # Collection name comes from configuration; the client itself is created
    # later by ``initialize``.
    self.collection_name = config.QDRANT_COLLECTION
    self.client: Optional[QdrantClient] = None
    # Concept -> [neighboring concepts with time]
    self.concept_graph: dict[str, list[dict]] = {}
|
| 44 |
+
|
| 45 |
+
def initialize(self):
    """Connect to Qdrant and make sure the configured collection exists.

    Uses ``config.QDRANT_HOST``/``config.QDRANT_PORT`` when a host is set,
    otherwise an in-memory client. The collection is created with cosine
    distance and ``config.EMBEDDING_DIM`` dimensions if it is not present.
    """
    host = config.QDRANT_HOST
    if host:
        self.client = QdrantClient(host=host, port=config.QDRANT_PORT)
    else:
        # In-memory mode for hackathon
        self.client = QdrantClient(":memory:")

    known_names = [c.name for c in self.client.get_collections().collections]
    if self.collection_name in known_names:
        return

    vector_params = VectorParams(
        size=config.EMBEDDING_DIM,
        distance=Distance.COSINE,
    )
    self.client.create_collection(
        collection_name=self.collection_name,
        vectors_config=vector_params,
    )
    logger.info(
        f"Created Qdrant collection '{self.collection_name}' "
        f"(dim={config.EMBEDDING_DIM})"
    )
|
| 71 |
+
|
| 72 |
+
def upsert_signal(
|
| 73 |
+
self,
|
| 74 |
+
signal_id: str,
|
| 75 |
+
embedding: list[float],
|
| 76 |
+
payload: dict,
|
| 77 |
+
):
|
| 78 |
+
"""Store a research signal vector with metadata.
|
| 79 |
+
|
| 80 |
+
Args:
|
| 81 |
+
signal_id: Unique signal identifier
|
| 82 |
+
embedding: Vector embedding
|
| 83 |
+
payload: Metadata payload (title, source, scores, etc.)
|
| 84 |
+
"""
|
| 85 |
+
self.client.upsert(
|
| 86 |
+
collection_name=self.collection_name,
|
| 87 |
+
points=[
|
| 88 |
+
PointStruct(
|
| 89 |
+
id=signal_id.replace("-", "")[:32], # Qdrant needs specific ID format
|
| 90 |
+
vector=embedding,
|
| 91 |
+
payload=payload,
|
| 92 |
+
)
|
| 93 |
+
],
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
def upsert_batch(
|
| 97 |
+
self,
|
| 98 |
+
ids: list[str],
|
| 99 |
+
embeddings: list[list[float]],
|
| 100 |
+
payloads: list[dict],
|
| 101 |
+
):
|
| 102 |
+
"""Batch upsert multiple vectors."""
|
| 103 |
+
points = [
|
| 104 |
+
PointStruct(
|
| 105 |
+
id=idx,
|
| 106 |
+
vector=emb,
|
| 107 |
+
payload=pay,
|
| 108 |
+
)
|
| 109 |
+
for idx, (emb, pay) in enumerate(zip(embeddings, payloads))
|
| 110 |
+
]
|
| 111 |
+
if points:
|
| 112 |
+
self.client.upsert(
|
| 113 |
+
collection_name=self.collection_name,
|
| 114 |
+
points=points,
|
| 115 |
+
)
|
| 116 |
+
logger.info(f"Upserted {len(points)} vectors")
|
| 117 |
+
|
| 118 |
+
def search(
|
| 119 |
+
self,
|
| 120 |
+
query_vector: list[float],
|
| 121 |
+
top_k: int = 10,
|
| 122 |
+
source_filter: Optional[str] = None,
|
| 123 |
+
) -> list[dict]:
|
| 124 |
+
"""Semantic similarity search.
|
| 125 |
+
|
| 126 |
+
Args:
|
| 127 |
+
query_vector: Query embedding
|
| 128 |
+
top_k: Number of results to return
|
| 129 |
+
source_filter: Optional filter by source type
|
| 130 |
+
|
| 131 |
+
Returns:
|
| 132 |
+
List of dicts with score and payload
|
| 133 |
+
"""
|
| 134 |
+
query_filter = None
|
| 135 |
+
if source_filter:
|
| 136 |
+
query_filter = Filter(
|
| 137 |
+
must=[
|
| 138 |
+
FieldCondition(
|
| 139 |
+
key="source",
|
| 140 |
+
match=MatchValue(value=source_filter),
|
| 141 |
+
)
|
| 142 |
+
]
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
response = self.client.query_points(
|
| 146 |
+
collection_name=self.collection_name,
|
| 147 |
+
query=query_vector,
|
| 148 |
+
limit=top_k,
|
| 149 |
+
query_filter=query_filter,
|
| 150 |
+
with_payload=True,
|
| 151 |
+
)
|
| 152 |
+
results = response.points
|
| 153 |
+
|
| 154 |
+
return [
|
| 155 |
+
{
|
| 156 |
+
"id": str(r.id),
|
| 157 |
+
"score": r.score,
|
| 158 |
+
"payload": r.payload,
|
| 159 |
+
}
|
| 160 |
+
for r in results
|
| 161 |
+
]
|
| 162 |
+
|
| 163 |
+
def get_collection_count(self) -> int:
|
| 164 |
+
"""Get total number of vectors in the collection."""
|
| 165 |
+
try:
|
| 166 |
+
info = self.client.get_collection(self.collection_name)
|
| 167 |
+
return info.points_count
|
| 168 |
+
except Exception:
|
| 169 |
+
return 0
|
| 170 |
+
|
| 171 |
+
def compute_novelty_score(
|
| 172 |
+
self,
|
| 173 |
+
embedding: list[float],
|
| 174 |
+
k: int = None,
|
| 175 |
+
) -> float:
|
| 176 |
+
"""Compute novelty score for a new embedding (Section 4.2 algorithm).
|
| 177 |
+
|
| 178 |
+
Steps:
|
| 179 |
+
1. Retrieve k nearest neighbors
|
| 180 |
+
2. Compute mean distance (d_mean) and min distance (d_min)
|
| 181 |
+
3. novelty = 0.6 * d_mean + 0.4 * d_min
|
| 182 |
+
4. Normalize to [0, 1]
|
| 183 |
+
|
| 184 |
+
Args:
|
| 185 |
+
embedding: New signal embedding
|
| 186 |
+
k: Number of neighbors (default from config)
|
| 187 |
+
|
| 188 |
+
Returns:
|
| 189 |
+
Novelty score in [0, 1]
|
| 190 |
+
"""
|
| 191 |
+
if k is None:
|
| 192 |
+
k = config.NOVELTY_K_NEIGHBORS
|
| 193 |
+
|
| 194 |
+
count = self.get_collection_count()
|
| 195 |
+
if count == 0:
|
| 196 |
+
return 1.0 # First signal is maximally novel
|
| 197 |
+
|
| 198 |
+
# Adjust k if we have fewer points
|
| 199 |
+
actual_k = min(k, count)
|
| 200 |
+
|
| 201 |
+
response = self.client.query_points(
|
| 202 |
+
collection_name=self.collection_name,
|
| 203 |
+
query=embedding,
|
| 204 |
+
limit=actual_k,
|
| 205 |
+
with_payload=False,
|
| 206 |
+
)
|
| 207 |
+
results = response.points
|
| 208 |
+
|
| 209 |
+
if not results:
|
| 210 |
+
return 1.0
|
| 211 |
+
|
| 212 |
+
# Cosine distance = 1 - cosine_similarity. For unit-normalized embeddings
|
| 213 |
+
# this is in [0, 2] but in practice [0, ~0.6] dominates within a topic.
|
| 214 |
+
distances = [max(0.0, min(2.0, 1.0 - float(r.score))) for r in results]
|
| 215 |
+
|
| 216 |
+
d_mean = float(np.mean(distances))
|
| 217 |
+
d_min = float(np.min(distances))
|
| 218 |
+
|
| 219 |
+
# Weighted combination of mean+min distance (Section 4.2).
|
| 220 |
+
raw_novelty = (
|
| 221 |
+
config.NOVELTY_MEAN_WEIGHT * d_mean
|
| 222 |
+
+ config.NOVELTY_MIN_WEIGHT * d_min
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
# Map raw distance into [0, 1] via a sigmoid centred at d=0.30. Below
|
| 226 |
+
# ~0.05 distance the novelty drops near 0; above ~0.55 it approaches 1
|
| 227 |
+
# but does not saturate the entire batch like the previous 5x linear
|
| 228 |
+
# scale did.
|
| 229 |
+
center = 0.30
|
| 230 |
+
slope = 8.0
|
| 231 |
+
z = slope * (raw_novelty - center)
|
| 232 |
+
novelty = 1.0 / (1.0 + 2.718281828 ** (-z))
|
| 233 |
+
|
| 234 |
+
return round(float(min(1.0, max(0.0, novelty))), 4)
|
| 235 |
+
|
| 236 |
+
def get_all_payloads(self, limit: int = 1000) -> list[dict]:
|
| 237 |
+
"""Retrieve all stored payloads (for trend computation)."""
|
| 238 |
+
try:
|
| 239 |
+
results = self.client.scroll(
|
| 240 |
+
collection_name=self.collection_name,
|
| 241 |
+
limit=limit,
|
| 242 |
+
with_payload=True,
|
| 243 |
+
with_vectors=False,
|
| 244 |
+
)
|
| 245 |
+
return [
|
| 246 |
+
{"id": str(point.id), **point.payload}
|
| 247 |
+
for point in results[0]
|
| 248 |
+
]
|
| 249 |
+
except Exception as e:
|
| 250 |
+
logger.error(f"Failed to scroll payloads: {e}")
|
| 251 |
+
return []
|
| 252 |
+
|
| 253 |
+
def get_vectors_for_projection(self, limit: int = 500) -> tuple[list, list]:
|
| 254 |
+
"""Get vectors and payloads for 2D projection (t-SNE/UMAP viz).
|
| 255 |
+
|
| 256 |
+
Returns:
|
| 257 |
+
Tuple of (vectors, payloads)
|
| 258 |
+
"""
|
| 259 |
+
try:
|
| 260 |
+
results = self.client.scroll(
|
| 261 |
+
collection_name=self.collection_name,
|
| 262 |
+
limit=limit,
|
| 263 |
+
with_payload=True,
|
| 264 |
+
with_vectors=True,
|
| 265 |
+
)
|
| 266 |
+
vectors = [point.vector for point in results[0]]
|
| 267 |
+
payloads = [point.payload for point in results[0]]
|
| 268 |
+
return vectors, payloads
|
| 269 |
+
except Exception as e:
|
| 270 |
+
logger.error(f"Failed to get vectors for projection: {e}")
|
| 271 |
+
return [], []
|
| 272 |
+
def build_temporal_graph(self) -> dict:
|
| 273 |
+
"""Construct a graph of concepts tracking evolution over time.
|
| 274 |
+
|
| 275 |
+
Follows Section 5.4: Temporal graph construction.
|
| 276 |
+
"""
|
| 277 |
+
payloads = self.get_all_payloads()
|
| 278 |
+
if not payloads: return {}
|
| 279 |
+
|
| 280 |
+
# 1. Group by category/tag
|
| 281 |
+
graph = {}
|
| 282 |
+
for p in payloads:
|
| 283 |
+
tags = p.get("categories", [])
|
| 284 |
+
for tag in tags:
|
| 285 |
+
if tag not in graph: graph[tag] = []
|
| 286 |
+
graph[tag].append({
|
| 287 |
+
"id": p.get("id"),
|
| 288 |
+
"title": p.get("title"),
|
| 289 |
+
"timestamp": p.get("timestamp"),
|
| 290 |
+
"score": p.get("novelty_score", 0)
|
| 291 |
+
})
|
| 292 |
+
|
| 293 |
+
self.concept_graph = graph
|
| 294 |
+
logger.info(f"Temporal graph built with {len(graph)} concept nodes")
|
| 295 |
+
return graph
|
ingestion/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# VectorMinds Ingestion Package
|
ingestion/arxiv_crawler.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""arXiv Crawler — Ingests research papers from arXiv API.
|
| 2 |
+
|
| 3 |
+
Fetches recent papers by category, parses metadata, and returns
|
| 4 |
+
normalized ResearchSignal objects.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
import arxiv
|
| 14 |
+
import httpx
|
| 15 |
+
|
| 16 |
+
from ingestion.schema import ResearchSignal, SignalSource
|
| 17 |
+
|
| 18 |
+
# Module-level logger for the arXiv crawler.
logger = logging.getLogger("vectorminds.arxiv")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class ArxivCrawler:
    """Crawls arXiv for recent AI/ML papers."""

    def __init__(self, categories: list[str], max_results: int = 50):
        self.categories = categories
        self.max_results = max_results
        self.client = arxiv.Client()

    @staticmethod
    def _to_signal(result) -> ResearchSignal:
        """Convert one arXiv result into a ResearchSignal."""
        from datetime import timezone

        return ResearchSignal(
            source=SignalSource.ARXIV,
            source_id=result.entry_id,
            # NOTE(review): `published` is presumably timezone-aware (UTC),
            # so the fallback is aware as well to avoid mixing naive and
            # aware timestamps — confirm against the arxiv package.
            timestamp=result.published or datetime.now(timezone.utc),
            title=result.title.strip().replace("\n", " "),
            raw_text=result.summary.strip().replace("\n", " "),
            authors=[a.name for a in result.authors],
            categories=list(result.categories),
            url=result.entry_id,
            metadata={
                "pdf_url": result.pdf_url or "",
                "primary_category": result.primary_category,
                "comment": result.comment or "",
                "journal_ref": result.journal_ref or "",
                "doi": result.doi or "",
                "updated": (
                    result.updated.isoformat() if result.updated else ""
                ),
            },
        )

    async def _run_search(self, search) -> list:
        """Run a blocking arXiv search in a worker thread.

        ``self.client.results`` is a synchronous generator that performs
        HTTP requests; draining it directly inside a coroutine would stall
        the event loop for the whole download.
        """
        import asyncio

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, lambda: list(self.client.results(search))
        )

    async def fetch_recent_papers(
        self,
        query: Optional[str] = None,
        category: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> list[ResearchSignal]:
        """Fetch recent papers from arXiv, newest submissions first.

        Args:
            query: Optional search query (e.g. 'transformer attention')
            category: Specific arXiv category (e.g. 'cs.LG')
            max_results: Override default max results

        Returns:
            List of ResearchSignal objects; empty on any failure.
        """
        n = max_results or self.max_results

        # Build search query: explicit query wins, then a single category,
        # otherwise every configured category OR-ed together.
        if query:
            search_query = query
        elif category:
            search_query = f"cat:{category}"
        else:
            search_query = " OR ".join(f"cat:{c}" for c in self.categories)

        logger.info(f"Fetching arXiv papers: query='{search_query}', max={n}")

        try:
            search = arxiv.Search(
                query=search_query,
                max_results=n,
                sort_by=arxiv.SortCriterion.SubmittedDate,
                sort_order=arxiv.SortOrder.Descending,
            )
            results = await self._run_search(search)
            signals = [self._to_signal(result) for result in results]

            logger.info(f"Fetched {len(signals)} papers from arXiv")
            return signals

        except Exception as e:
            logger.error(f"arXiv fetch failed: {e}")
            return []

    async def fetch_by_ids(self, paper_ids: list[str]) -> list[ResearchSignal]:
        """Fetch specific papers by their arXiv IDs.

        Returns an empty list when no IDs are given or the request fails.
        """
        if not paper_ids:
            return []

        try:
            search = arxiv.Search(id_list=paper_ids)
            results = await self._run_search(search)
            return [self._to_signal(result) for result in results]
        except Exception as e:
            logger.error(f"arXiv ID fetch failed: {e}")
            return []
|
ingestion/blog_crawler.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Blog crawler using public RSS feeds from AI labs."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
import httpx
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
|
| 11 |
+
from ingestion.schema import ResearchSignal, SignalSource
|
| 12 |
+
|
| 13 |
+
# Module-level logger for the blog/RSS crawler.
logger = logging.getLogger("vectorminds.blog")
|
| 14 |
+
|
| 15 |
+
# Public feed URLs polled by BlogCrawler, in the order they are fetched.
FEEDS = [
    "https://openai.com/blog/rss.xml",
    "https://deepmind.google/discover/blog/rss.xml",
    "https://huggingface.co/blog/feed.xml",
]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class BlogCrawler:
    """Collects blog posts from the RSS/Atom feeds listed in ``FEEDS``."""

    @staticmethod
    def _parse_timestamp(text: str) -> datetime | None:
        """Parse a feed date into a naive UTC datetime.

        Accepts ISO 8601 (Atom ``updated``/``published``) and RFC 822
        (RSS ``pubDate``) strings. Offsets are converted to UTC and then
        dropped so the result matches the naive-UTC timestamps used for the
        fallback. Returns ``None`` when the text cannot be parsed.
        """
        from datetime import timezone
        from email.utils import parsedate_to_datetime

        value = (text or "").strip()
        if not value:
            return None

        # fromisoformat() only understands a trailing "Z" on newer Pythons.
        iso_value = value[:-1] + "+00:00" if value.endswith(("Z", "z")) else value
        try:
            parsed = datetime.fromisoformat(iso_value)
        except ValueError:
            try:
                parsed = parsedate_to_datetime(value)
            except (TypeError, ValueError):
                return None

        if parsed.tzinfo is not None:
            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
        return parsed

    @staticmethod
    def _source_id(key: str) -> str:
        """Return a source id for a post that is stable across processes.

        The built-in ``hash()`` is salted per interpreter run for strings,
        so it cannot be used to recognise the same post on the next crawl.
        """
        import hashlib

        digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
        return f"blog:{digest[:16]}"

    async def fetch_blog_signals(self, max_results: int = 20) -> list[ResearchSignal]:
        """Fetch up to ``max_results`` posts from the configured feeds.

        Feeds are read in ``FEEDS`` order and the method returns as soon as
        the quota is filled. A feed that cannot be fetched or does not
        answer 200 is skipped; an unexpected error stops the crawl and
        returns whatever was collected up to that point.
        """
        signals: list[ResearchSignal] = []
        if max_results <= 0:
            return signals

        try:
            async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
                for feed_url in FEEDS:
                    try:
                        resp = await client.get(feed_url)
                    except httpx.HTTPError as e:
                        # One unreachable feed must not discard the others.
                        logger.warning(f"Blog feed unavailable ({feed_url}): {e}")
                        continue
                    if resp.status_code != 200:
                        continue

                    soup = BeautifulSoup(resp.text, "xml")
                    for item in soup.find_all(["item", "entry"]):
                        title_node = item.find("title")
                        title = (title_node.text if title_node is not None else "").strip()
                        if not title:
                            continue

                        summary_node = item.find("description") or item.find("summary")
                        summary = (
                            summary_node.text.strip()
                            if summary_node and summary_node.text
                            else ""
                        )

                        # Atom carries the URL in href, RSS in the element text.
                        link_node = item.find("link")
                        url = ""
                        if link_node:
                            url = link_node.get("href", "") or link_node.text or ""

                        date_node = (
                            item.find("pubDate")
                            or item.find("published")
                            or item.find("updated")
                        )
                        ts = None
                        if date_node and date_node.text:
                            ts = self._parse_timestamp(date_node.text)
                        if ts is None:
                            ts = datetime.utcnow()

                        signals.append(
                            ResearchSignal(
                                source=SignalSource.BLOG,
                                source_id=self._source_id(url or title),
                                timestamp=ts,
                                title=title,
                                raw_text=summary or title,
                                authors=["AI Blog"],
                                categories=["blog", "ai"],
                                url=url,
                                metadata={"feed_url": feed_url, "source_system": "rss"},
                            )
                        )
                        if len(signals) >= max_results:
                            return signals
            return signals
        except Exception as e:
            logger.error(f"Blog fetch failed: {e}")
            return signals
|
ingestion/github_crawler.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GitHub Crawler — Discovers trending ML/AI repositories.
|
| 2 |
+
|
| 3 |
+
Uses the GitHub REST API (unauthenticated / free) to find repositories
|
| 4 |
+
with high recent activity in machine learning topics.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
import httpx
|
| 14 |
+
|
| 15 |
+
from ingestion.schema import ResearchSignal, SignalSource
|
| 16 |
+
|
| 17 |
+
# Module-level logger for the GitHub crawler.
logger = logging.getLogger("vectorminds.github")
|
| 18 |
+
|
| 19 |
+
# Base URL that every GitHub REST request in this module is built from.
GITHUB_API = "https://api.github.com"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class GitHubCrawler:
    """Crawls GitHub for trending ML/AI repositories."""

    # Timestamp layout used by the created_at / updated_at fields.
    _TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    def __init__(
        self,
        languages: list[str] | None = None,
        max_results: int = 30,
        token: str = "",
    ):
        """Configure the crawler.

        Args:
            languages: Languages to restrict the search to (default: python)
            max_results: Default number of repositories to return
            token: Optional GitHub token; sent as an Authorization header
        """
        self.languages = languages or ["python"]
        self.max_results = max_results
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "VectorMinds-Research-Intelligence",
        }
        if token:
            headers["Authorization"] = f"token {token}"
        self.headers = headers

    @classmethod
    def _parse_timestamp(cls, value: str) -> datetime:
        """Parse a GitHub timestamp into a naive datetime."""
        return datetime.strptime(value, cls._TIMESTAMP_FORMAT)

    @staticmethod
    def _stars_per_day(stars: int, created: datetime, now: datetime | None = None) -> float:
        """Return stars divided by repository age in whole days (min 1), rounded to 2 places."""
        reference = now or datetime.utcnow()
        age_days = max((reference - created).days, 1)
        return round(stars / age_days, 2)

    @classmethod
    def _repo_to_signal(cls, repo: dict) -> ResearchSignal:
        """Convert one repository JSON object into a ResearchSignal.

        Shared by the search and single-repository paths so both produce
        the same metadata keys.
        """
        stars = repo.get("stargazers_count", 0) or 0
        created = cls._parse_timestamp(repo["created_at"])

        return ResearchSignal(
            source=SignalSource.GITHUB,
            source_id=repo["full_name"],
            timestamp=cls._parse_timestamp(repo["updated_at"]),
            title=repo["full_name"],
            raw_text=repo.get("description", "") or "",
            authors=[repo["owner"]["login"]],
            categories=repo.get("topics", []) or [],
            url=repo["html_url"],
            metadata={
                "stars": stars,
                "forks": repo.get("forks_count", 0),
                "watchers": repo.get("watchers_count", 0),
                # The key can be present with a null value.
                "language": repo.get("language") or "",
                "open_issues": repo.get("open_issues_count", 0),
                "stars_per_day": cls._stars_per_day(stars, created),
                "license": (repo.get("license") or {}).get("spdx_id", ""),
                "size_kb": repo.get("size", 0),
                "created_at": repo["created_at"],
            },
        )

    async def fetch_trending_repos(
        self,
        topic: str = "machine-learning",
        days_back: int = 7,
        max_results: Optional[int] = None,
    ) -> list[ResearchSignal]:
        """Fetch trending ML repos from GitHub.

        Searches repositories created within the look-back window that have
        more than 5 stars, sorted by stars descending.

        Args:
            topic: GitHub topic to search (e.g. 'machine-learning', 'deep-learning')
            days_back: Look back window in days
            max_results: Override default max results

        Returns:
            List of ResearchSignal objects; empty on any failure.
        """
        n = max_results or self.max_results
        since_date = (datetime.utcnow() - timedelta(days=days_back)).strftime(
            "%Y-%m-%d"
        )

        # Build search query for trending ML repos
        lang_query = " ".join(f"language:{lang}" for lang in self.languages)
        query = f"topic:{topic} {lang_query} created:>{since_date} stars:>5"

        logger.info(f"Fetching GitHub repos: query='{query}', max={n}")

        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                resp = await client.get(
                    f"{GITHUB_API}/search/repositories",
                    params={
                        "q": query,
                        "sort": "stars",
                        "order": "desc",
                        "per_page": min(n, 100),
                    },
                    headers=self.headers,
                )
                resp.raise_for_status()
                data = resp.json()

            signals = [
                self._repo_to_signal(repo) for repo in data.get("items", [])[:n]
            ]

            logger.info(f"Fetched {len(signals)} repos from GitHub")
            return signals

        except Exception as e:
            logger.error(f"GitHub fetch failed: {e}")
            return []

    async def fetch_repo_details(self, full_name: str) -> Optional[ResearchSignal]:
        """Fetch details for a specific repository.

        Args:
            full_name: Repository in ``owner/name`` form

        Returns:
            The repository as a ResearchSignal, or ``None`` on failure.
        """
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                resp = await client.get(
                    f"{GITHUB_API}/repos/{full_name}",
                    headers=self.headers,
                )
                resp.raise_for_status()
                repo = resp.json()

            return self._repo_to_signal(repo)
        except Exception as e:
            logger.error(f"GitHub repo fetch failed: {e}")
            return None
|
ingestion/patent_crawler.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Patent crawler using PatentsView public API."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
import httpx
|
| 9 |
+
|
| 10 |
+
from ingestion.schema import ResearchSignal, SignalSource
|
| 11 |
+
|
| 12 |
+
# Module-level logger for the patent crawler.
logger = logging.getLogger("vectorminds.patents")
|
| 13 |
+
|
| 14 |
+
# Endpoint that PatentCrawler POSTs its JSON query to.
# NOTE(review): verify this endpoint is still served before relying on it.
PATENTSVIEW_URL = "https://api.patentsview.org/patents/query"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class PatentCrawler:
    """Fetches recent AI-related patents from public PatentsView API."""

    @staticmethod
    def _assignee_orgs(patent: dict) -> list[str]:
        """Return the non-empty assignee organization names of a patent record.

        Tolerates a missing or null ``assignees`` list, non-dict entries and
        entries whose organization is null or empty.
        """
        assignees = patent.get("assignees") or []
        return [
            a["assignee_organization"]
            for a in assignees
            if isinstance(a, dict) and a.get("assignee_organization")
        ]

    @staticmethod
    def _parse_patent_date(value) -> datetime | None:
        """Parse a ``YYYY-MM-DD`` date; return ``None`` if absent or malformed."""
        try:
            return datetime.strptime(value, "%Y-%m-%d")
        except (TypeError, ValueError):
            return None

    async def fetch_recent_patents(self, max_results: int = 20) -> list[ResearchSignal]:
        """Query PatentsView for AI-related patents.

        Matches patents whose title mentions "artificial intelligence" or
        "machine learning", or whose abstract mentions "neural network" or
        "transformer model".

        Args:
            max_results: Page size requested from the API

        Returns:
            List of ResearchSignal objects; empty when the request fails or
            nothing matches.
        """
        query = {
            "_or": [
                {"_text_any": {"patent_title": "artificial intelligence"}},
                {"_text_any": {"patent_title": "machine learning"}},
                {"_text_any": {"patent_abstract": "neural network"}},
                {"_text_any": {"patent_abstract": "transformer model"}},
            ]
        }
        fields = [
            "patent_number",
            "patent_title",
            "patent_date",
            "patent_abstract",
            "patent_type",
            "assignee_organization",
        ]

        try:
            async with httpx.AsyncClient(timeout=25.0) as client:
                resp = await client.post(
                    PATENTSVIEW_URL,
                    json={
                        "q": query,
                        "f": fields,
                        "o": {"per_page": max_results},
                    },
                )
                resp.raise_for_status()
                data = resp.json()
        except Exception as e:
            logger.error(f"PatentsView fetch failed: {e}")
            return []

        # The key may be present with a null value when nothing matches.
        patents = data.get("patents") or []
        signals: list[ResearchSignal] = []
        for p in patents:
            patent_number = p.get("patent_number") or ""
            title = p.get("patent_title", "") or ""
            abstract = p.get("patent_abstract", "") or ""
            orgs = self._assignee_orgs(p)
            # Fall back to "now" when the grant date is missing or malformed.
            ts = self._parse_patent_date(p.get("patent_date", "")) or datetime.utcnow()

            signals.append(
                ResearchSignal(
                    source=SignalSource.PATENT,
                    source_id=patent_number or title[:64],
                    timestamp=ts,
                    title=title or f"Patent {patent_number}",
                    raw_text=abstract[:4000],
                    authors=orgs[:5],
                    categories=["patent", "ai"],
                    url=(
                        f"https://patents.google.com/patent/{patent_number}"
                        if patent_number
                        else ""
                    ),
                    metadata={
                        "patent_number": patent_number,
                        "patent_type": p.get("patent_type", ""),
                        "assignees": orgs[:10],
                        "source_system": "patentsview",
                    },
                )
            )
        return signals
|
ingestion/pdf_parser.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF Parser — Section-Aware Research Document Processing.
|
| 2 |
+
|
| 3 |
+
Extracts structured text from research PDFs and performs hierarchical
|
| 4 |
+
chunking for granular vector indexing.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
import logging
|
| 9 |
+
from typing import Dict, List
|
| 10 |
+
|
| 11 |
+
# Module-level logger for PDF parsing.
# NOTE(review): the crawler modules in this package log under the
# "vectorminds." prefix while this one uses "vectormind." — confirm which
# root name the logging configuration expects.
logger = logging.getLogger("vectormind.ingestion")
|
| 12 |
+
|
| 13 |
+
class PDFParser:
    """Parses research PDFs into structured sections and chunks."""

    # A line with more words than this is body text, even when it happens to
    # start with a section keyword ("Results show that ...").
    _MAX_HEADER_WORDS = 8

    def __init__(self):
        # Optional section number ("1", "2.", "3.1") followed by a keyword
        # as a whole word. Insertion order matters: the first matching
        # section wins.
        numbering = r"^(?:\d+(?:\.\d+)*\.?\s+)?"
        keywords = {
            "abstract": r"abstract",
            "introduction": r"introduction",
            "methodology": r"method(?:s|ology)?",
            "results": r"results?",
            "discussion": r"discussions?",
            "conclusion": r"conclusions?",
            "references": r"references?",
        }
        self.section_patterns = {
            name: re.compile(numbering + r"(?:" + keyword + r")\b", re.IGNORECASE)
            for name, keyword in keywords.items()
        }

    def _match_section(self, line: str):
        """Return the section a header line opens, or None for body text.

        A header must be short and must not end like a sentence; without
        that check ordinary sentences starting with a keyword would be
        swallowed as headers and lost from the output.
        """
        if len(line.split()) > self._MAX_HEADER_WORDS:
            return None
        if line.endswith((".", ",", ";", "?", "!")):
            return None
        for section_name, pattern in self.section_patterns.items():
            if pattern.match(line):
                return section_name
        return None

    def parse_text(self, text: str) -> Dict[str, str]:
        """Parse raw text into sections based on headers.

        Text before the first recognised header is collected under
        "abstract". Header lines themselves are not kept. Blank lines
        inside a section are preserved as a single empty line so that
        paragraph boundaries survive for ``hierarchical_chunking``.

        Returns:
            Dict with "abstract", "main_text" and "full_text" always
            present, plus one key per section encountered.
        """
        sections = {"abstract": "", "main_text": "", "full_text": text}
        current_section = "abstract"
        pending_break = False  # a blank line was seen since the last content line

        for line in text.split('\n'):
            line_clean = line.strip()
            if not line_clean:
                pending_break = True
                continue

            header = self._match_section(line_clean)
            if header is not None:
                current_section = header
                pending_break = False
                continue

            existing = sections.get(current_section, "")
            # Only separate paragraphs inside a section; never lead with a gap.
            separator = "\n" if pending_break and existing else ""
            sections[current_section] = existing + separator + line + "\n"
            pending_break = False

        return sections

    def hierarchical_chunking(self, sections: Dict[str, str], chunk_size: int = 1000) -> List[Dict]:
        """Create chunks at different granularity levels.

        Emits one "abstract"-level chunk (when an abstract exists) followed
        by "section"-level chunks, one per paragraph of every section except
        "full_text" and "references". Paragraphs shorter than 50 characters
        are dropped. Paragraphs longer than ``chunk_size`` characters are
        split on whitespace into parts of at most ``chunk_size`` characters;
        those parts carry an extra ``part`` index in their metadata.

        Args:
            sections: Mapping of section name to text, as from ``parse_text``
            chunk_size: Maximum characters per section-level chunk; a
                non-positive value disables splitting

        Returns:
            List of chunk dicts with "level", "text", "metadata" and, for
            section-level chunks, "section".
        """
        import textwrap

        chunks = []

        # 1. Abstract level (high-level)
        if sections.get("abstract"):
            chunks.append({
                "level": "abstract",
                "text": sections["abstract"],
                "metadata": {"type": "summary"},
            })

        # 2. Section level (paragraph-aware)
        for name, content in sections.items():
            if name in ("full_text", "references"):
                continue

            for i, raw_paragraph in enumerate(content.split('\n\n')):
                paragraph = raw_paragraph.strip()
                if len(paragraph) < 50:
                    continue

                if chunk_size and chunk_size > 0 and len(paragraph) > chunk_size:
                    parts = textwrap.wrap(
                        paragraph, width=chunk_size, break_on_hyphens=False
                    )
                else:
                    parts = [paragraph]

                for part_idx, part in enumerate(parts):
                    metadata = {"para_idx": i}
                    if len(parts) > 1:
                        metadata["part"] = part_idx
                    chunks.append({
                        "level": "section",
                        "section": name,
                        "text": part,
                        "metadata": metadata,
                    })

        return chunks
|
ingestion/schema.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Research Signal canonical schema.
|
| 2 |
+
|
| 3 |
+
All ingested data is normalized into this schema regardless of source.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import uuid
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from enum import Enum
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
from pydantic import BaseModel, Field
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class SignalSource(str, Enum):
    """Origin of an ingested signal.

    The ``str`` mix-in makes each member compare equal to its lowercase
    string value (``SignalSource.ARXIV == "arxiv"``).
    """

    # Research and code artifacts
    ARXIV = "arxiv"
    GITHUB = "github"
    PATENT = "patent"

    # Market and community chatter
    STARTUP = "startup"
    SOCIAL = "social"
    BLOG = "blog"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class ResearchSignal(BaseModel):
    """Canonical Research Signal — the universal data unit in VectorMinds.

    Only ``source``, ``title`` and ``raw_text`` are required; every other
    field falls back to the default declared below.
    """

    # --- Identity and provenance ---
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # fresh UUID4 string per instance
    source: SignalSource
    source_id: str = ""  # e.g. arXiv paper ID, GitHub repo full_name
    timestamp: datetime = Field(default_factory=datetime.utcnow)  # naive UTC "now" unless supplied

    # --- Content ---
    title: str
    raw_text: str  # abstract or description
    authors: list[str] = Field(default_factory=list)
    categories: list[str] = Field(default_factory=list)
    url: str = ""

    # --- Derived numeric features ---
    embedding: list[float] = Field(default_factory=list)
    novelty_score: float = 0.0
    impact_score: float = 0.0

    # Free-form, source-specific extras
    metadata: dict = Field(default_factory=dict)

    # Computed fields (populated by Reasoning Agent)
    technique_name: str = ""
    technical_brief: str = ""
    cross_source_signals: dict = Field(default_factory=dict)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class TrendEntry(BaseModel):
    """A ranked entry in the Trend Leaderboard.

    Only ``technique_name`` is required; every other field has a default.
    """

    # --- Identity and ranking ---
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # fresh UUID4 string per entry
    rank: int = 0
    technique_name: str
    description: str = ""

    # --- Scores and forecast ---
    emergence_score: float = 0.0
    novelty_score: float = 0.0
    impact_score: float = 0.0
    mainstream_eta_months: int = 12
    confidence: float = 0.0

    # --- Supporting evidence and context ---
    source_signals: dict = Field(default_factory=dict)
    competitive_landscape: list[str] = Field(default_factory=list)
    risk_factors: list[str] = Field(default_factory=list)
    related_techniques: list[str] = Field(default_factory=list)
    paper_count: int = 0
    github_stars: int = 0

    # --- Bookkeeping (timestamps default to naive UTC "now") ---
    first_seen: datetime = Field(default_factory=datetime.utcnow)
    last_updated: datetime = Field(default_factory=datetime.utcnow)
    signal_ids: list[str] = Field(default_factory=list)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class ProductBlueprint(BaseModel):
    """A complete product blueprint generated from a high-scoring technique.

    Only ``technique_name`` is required; the narrative sections default to
    empty strings and the list sections to empty lists.
    """

    # --- Identity and linkage ---
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # fresh UUID4 string per blueprint
    technique_name: str
    trend_id: str = ""
    created_at: datetime = Field(default_factory=datetime.utcnow)  # naive UTC "now"

    # --- Blueprint sections ---
    problem_statement: str = ""
    market_size: str = ""
    technical_implementation: str = ""
    architecture_decisions: list[str] = Field(default_factory=list)
    differentiation_strategy: str = ""
    dataset_requirements: str = ""
    go_to_market: str = ""
    risk_assessment: str = ""
    first_90_day_milestones: list[str] = Field(default_factory=list)
    suggested_stack: list[str] = Field(default_factory=list)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class MLPipeline(BaseModel):
    """A generated ML training pipeline.

    Only ``technique_name`` is required; every other field has a default.
    """

    # --- Identity and linkage ---
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # fresh UUID4 string per pipeline
    technique_name: str
    blueprint_id: str = ""
    task_type: str = ""  # one of SUPPORTED_TASK_CATEGORIES
    created_at: datetime = Field(default_factory=datetime.utcnow)  # naive UTC "now"

    # --- Pipeline definition ---
    dataset_name: str = ""
    dataset_source: str = ""
    model_architecture: str = ""
    notebook_content: str = ""
    colab_url: str = ""

    # --- Execution state and results ---
    status: str = "generated"  # generated, training, completed, failed
    metrics: dict = Field(default_factory=dict)
    model_card: str = ""
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class AgentEvent(BaseModel):
    """Event message passed between agents via the message bus.

    ``topic`` and ``source_agent`` are required; the ID, timestamp and
    payload are filled in automatically when omitted.
    """

    id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # fresh UUID4 string per event
    topic: str
    source_agent: str
    timestamp: datetime = Field(default_factory=datetime.utcnow)  # naive UTC "now"
    payload: dict = Field(default_factory=dict)  # free-form event body
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class UserFeedback(BaseModel):
    """User feedback on a prediction or blueprint.

    The three target/action fields are required plain strings; the expected
    values noted beside them are conventions, not enforced by this model.
    """

    id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # fresh UUID4 string per record
    target_id: str  # trend_id or blueprint_id
    target_type: str  # "trend" or "blueprint"
    action: str  # "upvote" or "downvote"
    timestamp: datetime = Field(default_factory=datetime.utcnow)  # naive UTC "now"
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class PlatformStats(BaseModel):
    """Live platform statistics for the dashboard.

    Every field has a zero/empty default, so ``PlatformStats()`` is a valid
    "nothing ingested yet" snapshot.
    """

    # --- Volume counters ---
    total_papers: int = 0
    total_github_repos: int = 0
    total_signals: int = 0
    active_trends: int = 0
    blueprints_generated: int = 0
    pipelines_launched: int = 0

    # --- Novelty summary ---
    avg_novelty_score: float = 0.0
    novelty_distribution: list[float] = Field(default_factory=list)

    # --- Health ---
    agents_status: dict = Field(default_factory=dict)
    last_ingestion: Optional[datetime] = None  # None until a value is supplied
|
ingestion/social_crawler.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Social signal crawler (Hacker News public API)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
import httpx
|
| 9 |
+
|
| 10 |
+
from ingestion.schema import ResearchSignal, SignalSource
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger("vectorminds.social")
|
| 13 |
+
|
| 14 |
+
# Hacker News public API endpoints.
# HN_TOP is fetched as JSON and sliced as a list of item IDs by the crawler;
# HN_ITEM is a ``str.format`` template filled with ``item_id``.
_HN_API_ROOT = "https://hacker-news.firebaseio.com/v0"
HN_TOP = _HN_API_ROOT + "/topstories.json"
HN_ITEM = _HN_API_ROOT + "/item/{item_id}.json"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class SocialCrawler:
    """Collects AI-related stories from the Hacker News public API."""

    # Keywords that mark a story as AI-related. They are matched as whole
    # words/phrases (a trailing "s" is tolerated). "openai" and "genai" are
    # listed explicitly because the previous substring test matched them
    # implicitly through "ai".
    _AI_KEYWORDS = (
        "ai",
        "llm",
        "machine learning",
        "transformer",
        "agent",
        "openai",
        "genai",
    )

    @classmethod
    def _mentions_ai(cls, text: str) -> bool:
        """Return True if *text* contains an AI keyword as a whole word or phrase."""
        # Turn every non-alphanumeric character into a space and collapse runs,
        # so "machine-learning" and "AI:" match while words that merely contain
        # the letters "ai" ("email", "said", "maintain") do not.
        words = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()
        haystack = " " + " ".join(words) + " "
        return any(
            f" {kw} " in haystack or f" {kw}s " in haystack
            for kw in cls._AI_KEYWORDS
        )

    def _to_signal(self, item_id, item: dict) -> ResearchSignal | None:
        """Convert one HN item payload into a signal, or None if it is filtered out.

        Items that are not of type ``"story"`` or that do not mention an AI
        keyword in their title/text are rejected.
        """
        if item.get("type") != "story":
            return None

        # ``or`` (rather than a ``.get`` default) also covers keys that are
        # present but null/empty.
        title = item.get("title") or ""
        text = item.get("text") or ""
        if not self._mentions_ai(f"{title} {text}"):
            return None

        ts = datetime.utcfromtimestamp(item.get("time", 0) or 0)
        return ResearchSignal(
            source=SignalSource.SOCIAL,
            source_id=f"HN-{item_id}",
            # A missing/zero "time" decodes to 1970; fall back to "now".
            timestamp=ts if ts.year > 2000 else datetime.utcnow(),
            title=title or "HN story",
            raw_text=text or title,
            authors=[item.get("by") or "hn_user"],
            categories=["hacker-news", "social"],
            # Items without an external link point at their HN discussion page.
            url=item.get("url") or f"https://news.ycombinator.com/item?id={item_id}",
            metadata={
                "hn_id": item_id,
                "score": item.get("score", 0),
                "descendants": item.get("descendants", 0),
                "source_system": "hackernews",
            },
        )

    async def fetch_hn_signals(self, max_results: int = 30) -> list[ResearchSignal]:
        """Fetch up to *max_results* AI-related top stories from Hacker News.

        Scans the first ``max_results * 2`` top-story IDs one request at a
        time and stops as soon as *max_results* signals have been built.

        Args:
            max_results: Maximum number of signals to return; values <= 0
                yield an empty list.

        Returns:
            The signals collected. This method does not raise: a failure on a
            single item is logged and that item is skipped; any other failure
            is logged and ends the crawl, returning what was collected so far
            (possibly nothing).
        """
        signals: list[ResearchSignal] = []
        if max_results <= 0:
            return signals

        try:
            async with httpx.AsyncClient(timeout=25.0) as client:
                ids_resp = await client.get(HN_TOP)
                ids_resp.raise_for_status()
                top_ids = (ids_resp.json() or [])[: max_results * 2]

                for item_id in top_ids:
                    try:
                        item_resp = await client.get(HN_ITEM.format(item_id=item_id))
                        if item_resp.status_code != 200:
                            continue
                        signal = self._to_signal(item_id, item_resp.json() or {})
                    except (httpx.HTTPError, ValueError) as exc:
                        # Transport errors, undecodable JSON, or a payload the
                        # schema rejects: drop this item only, keep the rest.
                        logger.warning("Skipping HN item %s: %s", item_id, exc)
                        continue

                    if signal is None:
                        continue
                    signals.append(signal)
                    if len(signals) >= max_results:
                        break
        except Exception as e:
            # Best-effort crawler: never propagate, but keep partial results.
            logger.error("HN fetch failed: %s", e)
        return signals
|
ingestion/startup_crawler.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Startup signal crawler using public startup/news RSS feeds."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
import httpx
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
|
| 11 |
+
from ingestion.schema import ResearchSignal, SignalSource
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger("vectorminds.startups")
|
| 14 |
+
|
| 15 |
+
# Feeds polled for startup news, in polling order.
STARTUP_FEEDS = [
    "https://techcrunch.com/category/startups/feed/",  # TechCrunch startups category
    "https://www.ycombinator.com/blog/rss/",  # Y Combinator blog
]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class StartupCrawler:
    """Collects AI-related startup news from the feeds in ``STARTUP_FEEDS``."""

    # Keywords that mark an entry as AI-related. They are matched as whole
    # words/phrases (a trailing "s" is tolerated). "openai" and "genai" are
    # listed explicitly because the previous substring test matched them
    # implicitly through "ai".
    _AI_KEYWORDS = ("ai", "machine learning", "llm", "model", "openai", "genai")

    @classmethod
    def _mentions_ai(cls, text: str) -> bool:
        """Return True if *text* contains an AI keyword as a whole word or phrase."""
        # Turn every non-alphanumeric character into a space and collapse runs,
        # so "machine-learning" and "AI:" match while words that merely contain
        # the letters "ai" ("email", "said", "raises") do not.
        words = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()
        haystack = " " + " ".join(words) + " "
        return any(
            f" {kw} " in haystack or f" {kw}s " in haystack
            for kw in cls._AI_KEYWORDS
        )

    @staticmethod
    def _stable_source_id(key: str) -> str:
        """Derive a source ID from *key* that is identical across processes.

        The built-in ``hash()`` is salted per interpreter run for strings, so
        it cannot be used for IDs that must survive a restart.
        """
        import hashlib  # local import: not imported at module level in this file

        digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
        return f"startup:{digest}"

    def _to_signal(self, item, feed_url: str) -> ResearchSignal | None:
        """Convert one feed ``<item>``/``<entry>`` node into a signal.

        Returns None when the node has no title or does not mention an AI
        keyword in its title/description.
        """
        title_node = item.find("title")
        title = (title_node.text if title_node is not None else "").strip()
        if not title:
            return None

        desc_node = item.find("description")
        description = (desc_node.text if desc_node is not None else "").strip()
        if not self._mentions_ai(f"{title} {description}"):
            return None

        # The link is read from an ``href`` attribute first, then from the
        # element text.
        link_node = item.find("link")
        url = ""
        if link_node is not None:
            url = (link_node.get("href", "") or link_node.text or "").strip()

        return ResearchSignal(
            source=SignalSource.STARTUP,
            source_id=self._stable_source_id(url or title),
            timestamp=datetime.utcnow(),
            title=title,
            # Keep the description in its original case; lowercasing is only
            # needed for keyword matching.
            raw_text=description or title,
            authors=["startup-news"],
            categories=["startup", "funding"],
            url=url,
            metadata={"feed_url": feed_url, "source_system": "rss"},
        )

    async def fetch_startup_signals(self, max_results: int = 20) -> list[ResearchSignal]:
        """Fetch up to *max_results* AI-related entries from the startup feeds.

        Feeds are polled in ``STARTUP_FEEDS`` order; polling stops as soon as
        *max_results* signals have been collected.

        Args:
            max_results: Maximum number of signals to return; values <= 0
                yield an empty list.

        Returns:
            The signals collected. This method does not raise: an unreachable
            or non-200 feed is skipped, and any other failure is logged and
            ends the crawl, returning what was collected so far.
        """
        signals: list[ResearchSignal] = []
        if max_results <= 0:
            return signals

        try:
            async with httpx.AsyncClient(timeout=20.0, follow_redirects=True) as client:
                for feed_url in STARTUP_FEEDS:
                    try:
                        resp = await client.get(feed_url)
                    except httpx.HTTPError as exc:
                        # One dead feed must not prevent polling the others.
                        logger.warning("Startup feed %s unreachable: %s", feed_url, exc)
                        continue
                    if resp.status_code != 200:
                        continue

                    soup = BeautifulSoup(resp.text, "xml")
                    for item in soup.find_all(["item", "entry"]):
                        signal = self._to_signal(item, feed_url)
                        if signal is None:
                            continue
                        signals.append(signal)
                        if len(signals) >= max_results:
                            return signals
        except Exception as e:
            # Best-effort crawler: never propagate, but keep partial results.
            logger.error("Startup feed fetch failed: %s", e)
        return signals
|
intelligence/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# VectorMinds Intelligence Package
|
intelligence/blueprint_engine.py
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Product Blueprint Engine — Generates startup-ready product briefs.
|
| 2 |
+
|
| 3 |
+
Takes a high-scoring technique and generates a complete product blueprint
|
| 4 |
+
using LLM (Gemini) or mock data for demo.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
import httpx
|
| 16 |
+
|
| 17 |
+
from ingestion.schema import ProductBlueprint, TrendEntry
|
| 18 |
+
import config
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger("vectorminds.blueprint")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _coerce_text(value, indent: int = 0) -> str:
|
| 24 |
+
"""Coerce a Gemini value (string|list|dict|number|None) into a readable string.
|
| 25 |
+
|
| 26 |
+
Lists become bullet lines; dicts become "Key:\\n - item" sections. Used because
|
| 27 |
+
the LLM sometimes returns nested objects for fields the schema types as ``str``.
|
| 28 |
+
"""
|
| 29 |
+
if value is None:
|
| 30 |
+
return ""
|
| 31 |
+
if isinstance(value, str):
|
| 32 |
+
return value
|
| 33 |
+
if isinstance(value, (int, float, bool)):
|
| 34 |
+
return str(value)
|
| 35 |
+
pad = " " * indent
|
| 36 |
+
if isinstance(value, list):
|
| 37 |
+
return "\n".join(f"{pad}- {_coerce_text(v, indent + 1).lstrip()}" for v in value)
|
| 38 |
+
if isinstance(value, dict):
|
| 39 |
+
chunks = []
|
| 40 |
+
for k, v in value.items():
|
| 41 |
+
label = str(k).replace("_", " ").strip()
|
| 42 |
+
inner = _coerce_text(v, indent + 1)
|
| 43 |
+
if "\n" in inner or (isinstance(v, (list, dict))):
|
| 44 |
+
chunks.append(f"{pad}{label}:\n{inner}")
|
| 45 |
+
else:
|
| 46 |
+
chunks.append(f"{pad}{label}: {inner}")
|
| 47 |
+
return "\n".join(chunks)
|
| 48 |
+
return str(value)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _coerce_str_list(value) -> list[str]:
|
| 52 |
+
"""Force a value into a list[str], coercing nested dicts/lists to readable lines."""
|
| 53 |
+
if value is None:
|
| 54 |
+
return []
|
| 55 |
+
if isinstance(value, list):
|
| 56 |
+
return [_coerce_text(v).strip() for v in value if v is not None]
|
| 57 |
+
if isinstance(value, dict):
|
| 58 |
+
return [f"{k}: {_coerce_text(v).strip()}" for k, v in value.items()]
|
| 59 |
+
return [str(value)]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _repair_truncated_json(text: str) -> Optional[dict]:
|
| 63 |
+
"""Best-effort recovery of a truncated JSON object from a Gemini response.
|
| 64 |
+
|
| 65 |
+
Closes any open string, then closes any unbalanced ``[`` / ``{`` brackets,
|
| 66 |
+
in stack order. Returns ``None`` if the result still does not parse.
|
| 67 |
+
"""
|
| 68 |
+
if not text:
|
| 69 |
+
return None
|
| 70 |
+
s = text.strip()
|
| 71 |
+
if not s.startswith("{"):
|
| 72 |
+
start = s.find("{")
|
| 73 |
+
if start == -1:
|
| 74 |
+
return None
|
| 75 |
+
s = s[start:]
|
| 76 |
+
|
| 77 |
+
in_str = False
|
| 78 |
+
escape = False
|
| 79 |
+
stack: list[str] = []
|
| 80 |
+
for ch in s:
|
| 81 |
+
if escape:
|
| 82 |
+
escape = False
|
| 83 |
+
continue
|
| 84 |
+
if ch == "\\":
|
| 85 |
+
escape = True
|
| 86 |
+
continue
|
| 87 |
+
if ch == '"':
|
| 88 |
+
in_str = not in_str
|
| 89 |
+
continue
|
| 90 |
+
if in_str:
|
| 91 |
+
continue
|
| 92 |
+
if ch in "{[":
|
| 93 |
+
stack.append("}" if ch == "{" else "]")
|
| 94 |
+
elif ch in "}]" and stack and stack[-1] == ch:
|
| 95 |
+
stack.pop()
|
| 96 |
+
|
| 97 |
+
repaired = s
|
| 98 |
+
if in_str:
|
| 99 |
+
repaired += '"'
|
| 100 |
+
while stack:
|
| 101 |
+
repaired += stack.pop()
|
| 102 |
+
|
| 103 |
+
repaired = repaired.rstrip(",")
|
| 104 |
+
try:
|
| 105 |
+
return json.loads(repaired)
|
| 106 |
+
except Exception:
|
| 107 |
+
return None
|
| 108 |
+
|
| 109 |
+
# ─── Mock Blueprints for Demo ─────────────────────────────────
|
| 110 |
+
# Canned blueprint content used when no LLM is available. The text describes
# a long-document intelligence product and is the same for every technique.
_DEFAULT_MOCK_BLUEPRINT = {
    "problem_statement": (
        "Enterprise organizations process millions of documents daily — contracts, "
        "reports, emails, compliance filings — but existing NLP solutions are limited "
        "by transformer context windows (typically 4K-8K tokens). Documents exceeding "
        "this limit require chunking strategies that lose cross-section context, "
        "resulting in 15-30% accuracy degradation on long-document tasks. The market "
        "for intelligent document processing is $4.2B (2024) growing at 28% CAGR."
    ),
    "market_size": "$4.2B (2024), projected $12.1B by 2028 at 28% CAGR",
    "technical_implementation": (
        "Build a document intelligence API powered by the selected technique. "
        "Architecture: (1) Document ingestion service with OCR and layout detection, "
        "(2) Adaptive chunking engine that preserves cross-reference context, "
        "(3) Core inference engine using the technique for unlimited-context processing, "
        "(4) Structured output layer with JSON/XML schema enforcement, "
        "(5) REST API with streaming support for real-time processing. "
        "Deploy on AWS with auto-scaling GPU instances (A10G for inference)."
    ),
    "architecture_decisions": [
        "Use streaming inference to handle arbitrarily long documents",
        "Implement a hybrid retrieval + full-context approach for optimal accuracy",
        "Deploy as a containerized microservice for horizontal scaling",
        "Cache embeddings in Redis for repeated document access patterns",
    ],
    "differentiation_strategy": (
        "Unlike existing solutions (AWS Textract, Google Document AI, Azure Form "
        "Recognizer), this product handles documents of ANY length without chunking "
        "degradation. The core moat is the technique's linear-time complexity, "
        "enabling 100x longer context at 1/10th the cost. Additional moats: "
        "proprietary fine-tuning on 500K enterprise documents, and a self-improving "
        "feedback loop where corrections from users improve the model continuously."
    ),
    "dataset_requirements": (
        "Initial training: (1) DocVQA (50K document-question pairs), "
        "(2) SQuAD 2.0 for reading comprehension baseline, "
        "(3) Contract Understanding Atticus Dataset (CUAD) for legal domain, "
        "(4) FUNSD for form understanding. Proprietary data collection: "
        "Partner with 3 enterprise customers for anonymized document datasets. "
        "Synthetic data: Generate 100K long-document QA pairs using GPT-4."
    ),
    "go_to_market": (
        "Target: Legal tech firms and compliance teams (high document volume, "
        "high accuracy requirements). Channel: Direct sales to Top-50 law firms, "
        "integration partnerships with existing DMS providers (NetDocuments, iManage). "
        "Pricing: Usage-based API pricing — $0.01/page for standard, $0.05/page for "
        "premium with human-in-the-loop verification. First 90 days: 3 design partners, "
        "10K documents processed, 95%+ accuracy on standard benchmarks."
    ),
    "risk_assessment": (
        "Technical risks: (1) Technique may not generalize to all document types — "
        "mitigate with domain-specific fine-tuning. (2) Inference latency on very "
        "long documents (>100 pages) — mitigate with streaming and caching. "
        "Market risks: (1) Incumbent response — AWS/Google/Azure may adopt similar "
        "techniques within 12 months — speed to market is critical. "
        "Competitive risks: (1) Several well-funded startups in adjacent space — "
        "differentiate on long-context capability."
    ),
    "first_90_day_milestones": [
        "Week 1-2: Core model integration and API scaffold",
        "Week 3-4: Document ingestion pipeline with OCR",
        "Week 5-6: Fine-tune on DocVQA and CUAD datasets",
        "Week 7-8: API deployment with auth and rate limiting",
        "Week 9-10: First design partner onboarding",
        "Week 11-12: Benchmark publication and Product Hunt launch",
    ],
    "suggested_stack": [
        "PyTorch + HuggingFace Transformers",
        "FastAPI + Pydantic",
        "Redis for caching",
        "PostgreSQL for metadata",
        "AWS ECS + A10G GPU instances",
        "Stripe for billing",
    ],
}

# Templates keyed by name; "default" is the only entry.
MOCK_BLUEPRINTS = {"default": _DEFAULT_MOCK_BLUEPRINT}
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
class BlueprintEngine:
    """Generates product blueprints from high-scoring research techniques.

    A blueprint comes from the Gemini API when an API key is configured and
    mock mode is off; otherwise, or when the LLM path fails for any reason,
    the canned ``MOCK_BLUEPRINTS["default"]`` template is used. Every
    blueprint returned by ``generate_blueprint`` is kept in memory, keyed by
    its ID.
    """

    # HTTP status codes treated as transient and retried.
    _RETRYABLE_STATUS = (500, 502, 503, 504, 429)
    # One attempt per entry; the value is the pause before the NEXT attempt.
    _RETRY_DELAYS = (1.5, 3.0, 6.0)

    def __init__(self):
        self.generated_blueprints: dict[str, ProductBlueprint] = {}

    async def generate_blueprint(
        self,
        trend: TrendEntry,
        additional_context: str = "",
    ) -> ProductBlueprint:
        """Generate a complete product blueprint for a technique.

        Args:
            trend: The trend entry to generate a blueprint for
            additional_context: Additional research context

        Returns:
            Complete ProductBlueprint
        """
        logger.info(f"Generating blueprint for: {trend.technique_name}")

        if config.USE_MOCK_LLM or not config.LLM_API_KEY:
            blueprint = self._generate_mock_blueprint(trend)
        else:
            blueprint = await self._generate_llm_blueprint(trend, additional_context)

        self.generated_blueprints[blueprint.id] = blueprint
        logger.info(f"Blueprint generated: {blueprint.id}")
        return blueprint

    @staticmethod
    def _redact(message: str) -> str:
        """Mask the API key in *message* before it is logged.

        The key is sent as a query parameter, and httpx includes the full
        request URL in the text of errors raised by ``raise_for_status``.
        """
        key = config.LLM_API_KEY
        return message.replace(key, "***") if key else message

    @staticmethod
    def _build_payload(trend: TrendEntry, context: str) -> dict:
        """Assemble the ``generateContent`` request body for *trend*."""
        prompt = (
            f"Generate a complete startup product blueprint based on this "
            f"emerging AI technique:\n\n"
            f"Technique: {trend.technique_name}\n"
            f"Description: {trend.description}\n"
            f"Emergence Score: {trend.emergence_score}\n"
            f"Impact Score: {trend.impact_score}\n"
            f"Mainstream ETA: {trend.mainstream_eta_months} months\n\n"
            # Only the first 1500 characters of context are sent.
            f"Additional context:\n{context[:1500]}\n\n"
            f"Please provide a JSON response with these fields:\n"
            f"problem_statement, market_size, technical_implementation, "
            f"architecture_decisions (list), differentiation_strategy, "
            f"dataset_requirements, go_to_market, risk_assessment, "
            f"first_90_day_milestones (list), suggested_stack (list)"
        )
        full_prompt = (
            "You are a senior product strategist and AI architect. "
            "Generate detailed, actionable product blueprints.\n"
            "Return ONLY valid JSON with these keys: "
            "problem_statement, market_size, technical_implementation, "
            "architecture_decisions, differentiation_strategy, dataset_requirements, "
            "go_to_market, risk_assessment, first_90_day_milestones, suggested_stack.\n\n"
            f"{prompt}"
        )
        return {
            "contents": [{"parts": [{"text": full_prompt}]}],
            "generationConfig": {
                "temperature": 0.7,
                "maxOutputTokens": 8192,
                "responseMimeType": "application/json",
            },
        }

    async def _post_with_retry(self, client, payload: dict):
        """POST *payload* to Gemini, retrying transient failures.

        Gemini occasionally returns 503/overloaded. Up to
        ``len(_RETRY_DELAYS)`` attempts are made, pausing between attempts
        but not after the last one.

        Returns:
            The first response whose status is not in ``_RETRYABLE_STATUS``.

        Raises:
            httpx.HTTPStatusError: Every attempt returned a retryable status.
            httpx.ConnectError, httpx.ReadTimeout, httpx.RemoteProtocolError:
                The final attempt failed at the network level.
        """
        url = f"{config.GEMINI_BASE_URL}/models/{config.LLM_MODEL}:generateContent"
        last_exc: Optional[Exception] = None
        final_attempt = len(self._RETRY_DELAYS)

        for attempt, delay in enumerate(self._RETRY_DELAYS, start=1):
            try:
                resp = await client.post(
                    url,
                    params={"key": config.LLM_API_KEY},
                    headers={"Content-Type": "application/json"},
                    json=payload,
                )
            except (httpx.ConnectError, httpx.ReadTimeout, httpx.RemoteProtocolError) as e:
                last_exc = e
                logger.warning(
                    "Gemini blueprint network error attempt %s: %s",
                    attempt,
                    self._redact(str(e)),
                )
            else:
                if resp.status_code not in self._RETRYABLE_STATUS:
                    return resp
                logger.warning(
                    "Gemini blueprint attempt %s returned %s; retrying",
                    attempt,
                    resp.status_code,
                )
                last_exc = httpx.HTTPStatusError(
                    f"Gemini transient {resp.status_code}",
                    request=resp.request,
                    response=resp,
                )
            # Sleeping after the final attempt would only delay the fallback.
            if attempt < final_attempt:
                await asyncio.sleep(delay)

        if last_exc is not None:
            raise last_exc
        raise RuntimeError("Gemini blueprint: no response")

    @staticmethod
    def _parse_blueprint_json(data: dict) -> dict:
        """Extract the blueprint JSON object from a ``generateContent`` response.

        The text parts of the first candidate are concatenated and parsed. If
        parsing fails, ``_repair_truncated_json`` is tried before giving up.

        Raises:
            ValueError: No candidates, empty text, unparseable JSON
                (``json.JSONDecodeError``), or JSON that is not an object.
        """
        candidates = data.get("candidates", [])
        if not candidates:
            raise ValueError("No Gemini candidates returned")
        first = candidates[0]
        finish_reason = first.get("finishReason", "")
        parts = first.get("content", {}).get("parts", [])
        content = "".join(p.get("text", "") for p in parts if isinstance(p, dict)).strip()
        if not content:
            raise ValueError("Empty Gemini response text")

        try:
            bp_data = json.loads(content)
        except json.JSONDecodeError:
            if finish_reason == "MAX_TOKENS":
                logger.warning(
                    "Gemini blueprint truncated by MAX_TOKENS; attempting repair"
                )
            # Attempt the repair whatever the finish reason, so ``bp_data`` is
            # always bound before it is used below.
            bp_data = _repair_truncated_json(content)
            if bp_data is None:
                raise

        if not isinstance(bp_data, dict):
            raise ValueError("Gemini blueprint JSON is not an object")
        return bp_data

    async def _generate_llm_blueprint(
        self, trend: TrendEntry, context: str
    ) -> ProductBlueprint:
        """Generate blueprint using Gemini LLM.

        Never raises: any failure (network, HTTP status, malformed response)
        is logged with the API key masked, and the mock blueprint is returned
        instead.
        """
        try:
            payload = self._build_payload(trend, context)
            async with httpx.AsyncClient(timeout=60.0) as client:
                resp = await self._post_with_retry(client, payload)
                resp.raise_for_status()
                bp_data = self._parse_blueprint_json(resp.json())

            # The LLM sometimes returns nested objects for string fields, so
            # every field is coerced to the type the schema declares.
            return ProductBlueprint(
                technique_name=trend.technique_name,
                trend_id=trend.id,
                problem_statement=_coerce_text(bp_data.get("problem_statement", "")),
                market_size=_coerce_text(bp_data.get("market_size", "")),
                technical_implementation=_coerce_text(bp_data.get("technical_implementation", "")),
                architecture_decisions=_coerce_str_list(bp_data.get("architecture_decisions", [])),
                differentiation_strategy=_coerce_text(bp_data.get("differentiation_strategy", "")),
                dataset_requirements=_coerce_text(bp_data.get("dataset_requirements", "")),
                go_to_market=_coerce_text(bp_data.get("go_to_market", "")),
                risk_assessment=_coerce_text(bp_data.get("risk_assessment", "")),
                first_90_day_milestones=_coerce_str_list(bp_data.get("first_90_day_milestones", [])),
                suggested_stack=_coerce_str_list(bp_data.get("suggested_stack", [])),
            )
        except Exception as e:
            logger.error("LLM blueprint generation failed: %s", self._redact(str(e)))
            return self._generate_mock_blueprint(trend)

    def _generate_mock_blueprint(self, trend: TrendEntry) -> ProductBlueprint:
        """Generate a mock blueprint for demo purposes.

        Only ``technique_name`` and ``trend_id`` come from *trend*; all other
        content is the canned default template.
        """
        mock = MOCK_BLUEPRINTS["default"]
        return ProductBlueprint(
            technique_name=trend.technique_name,
            trend_id=trend.id,
            problem_statement=mock["problem_statement"],
            market_size=mock["market_size"],
            technical_implementation=mock["technical_implementation"],
            # Pass copies so no blueprint can share the template's list objects.
            architecture_decisions=list(mock["architecture_decisions"]),
            differentiation_strategy=mock["differentiation_strategy"],
            dataset_requirements=mock["dataset_requirements"],
            go_to_market=mock["go_to_market"],
            risk_assessment=mock["risk_assessment"],
            first_90_day_milestones=list(mock["first_90_day_milestones"]),
            suggested_stack=list(mock["suggested_stack"]),
        )

    def get_blueprint(self, blueprint_id: str) -> Optional[ProductBlueprint]:
        """Return the stored blueprint with *blueprint_id*, or None if unknown."""
        return self.generated_blueprints.get(blueprint_id)

    def list_blueprints(self) -> list[ProductBlueprint]:
        """Return all stored blueprints as a new list."""
        return list(self.generated_blueprints.values())
|
intelligence/experiment_designer.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Experiment Designer — Autonomous Scientific Validation Engine.
|
| 2 |
+
|
| 3 |
+
Follows Section 6.5: Generates minimal viable experiments to validate
|
| 4 |
+
research hypotheses derived from product blueprints.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Optional
|
| 9 |
+
import config
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger("vectormind.intelligence")
|
| 12 |
+
|
| 13 |
+
class ExperimentDesigner:
    """Generates scientific experiment designs for research validation."""

    # Canned experiment settings. The vision set is used when the brief
    # mentions "vision"; the text set otherwise.
    _VISION_SETTINGS = {
        "dataset_suggestion": "Tiny-ImageNet (Subsampled)",
        "model_suggestion": "MobileNet-V3",
        "key_metric": "Top-1 Accuracy",
        "baseline": "0.72",
    }
    _TEXT_SETTINGS = {
        "dataset_suggestion": "WikiText-2",
        "model_suggestion": "DistilBERT",
        "key_metric": "Perplexity",
        "baseline": "24.5",
    }

    def __init__(self):
        # ``str.format`` template with six placeholders: technique_name,
        # hypothesis, dataset_suggestion, model_suggestion, key_metric,
        # baseline. The verdict wording is direction-neutral because the key
        # metric may be higher-is-better (accuracy) or lower-is-better
        # (perplexity).
        self.template = """
# Experiment Design: {technique_name} Validation
# Generated by VectorMind Autonomous Agent

## 1. Research Hypothesis
{hypothesis}

## 2. Minimal Viable Experiment (MVE)
- Dataset: {dataset_suggestion}
- Model Architecture: {model_suggestion}
- Key Metric: {key_metric}
- Target Baseline: {baseline}

## 3. Implementation (PyTorch/HuggingFace)
```python
import torch
import transformers
# ... auto-generated experiment code ...
```

## 4. Pass/Fail Verdict Criteria
The experiment is considered a PASS if the {key_metric} improves on the
baseline of {baseline} with a 95% confidence interval across 5 runs.
"""

    async def design_experiment(self, technique_name: str, brief: str) -> dict:
        """Generate a structured experiment design based on a technical brief.

        Placeholder logic: no LLM is called. The settings are chosen by a
        case-insensitive check for the word "vision" in *brief*, and the
        hypothesis is a fixed sentence built from *technique_name*.

        Args:
            technique_name: Name of the technique to validate.
            brief: Technical brief; a missing brief selects the text settings.

        Returns:
            A dict with keys ``technique_name``, ``hypothesis``,
            ``dataset_suggestion``, ``model_suggestion``, ``key_metric``,
            ``baseline`` and ``notebook_content`` (the rendered template,
            using the same settings as the other keys).
        """
        logger.info(f"Designing experiment for: {technique_name}")

        is_vision = "vision" in (brief or "").lower()
        settings = self._VISION_SETTINGS if is_vision else self._TEXT_SETTINGS

        hypothesis = f"Applying {technique_name} will improve state-of-the-art efficiency by at least 15%."

        experiment = {
            "technique_name": technique_name,
            "hypothesis": hypothesis,
            **settings,
        }
        # Render from the same settings so the notebook and the structured
        # fields can never disagree.
        experiment["notebook_content"] = self.template.format(
            technique_name=technique_name,
            hypothesis=hypothesis,
            **settings,
        )
        return experiment
|
intelligence/pipeline_executor.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pipeline execution engine.
|
| 2 |
+
|
| 3 |
+
Runs generated pipeline scripts as background subprocesses, captures logs,
|
| 4 |
+
and exposes runtime status/artifact locations.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import os
|
| 11 |
+
import sys
|
| 12 |
+
import uuid
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from datetime import datetime, timezone
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Awaitable, Callable, Optional
|
| 17 |
+
|
| 18 |
+
import config
|
| 19 |
+
from ingestion.schema import MLPipeline
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _now_iso() -> str:
|
| 23 |
+
return datetime.now(timezone.utc).isoformat()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class PipelineRun:
    """Mutable record of a single pipeline run and its on-disk locations."""

    # Identifiers of this run and of the pipeline it belongs to.
    run_id: str
    pipeline_id: str

    # Lifecycle state and timestamps (ISO strings; None until set).
    status: str = "queued"  # queued|running|completed|failed|timeout
    created_at: str = field(default_factory=_now_iso)
    started_at: Optional[str] = None
    finished_at: Optional[str] = None

    # Outcome of the most recent attempt.
    exit_code: Optional[int] = None
    duration_seconds: Optional[float] = None
    error: str = ""

    # Filesystem locations, stored as strings.
    run_dir: str = ""
    script_path: str = ""
    log_path: str = ""
    artifacts_dir: str = ""

    # Command line of the launched process and retry bookkeeping.
    command: list[str] = field(default_factory=list)
    retry_count: int = 0
    max_retries: int = 0

    def to_dict(self) -> dict:
        """Return all fields as a plain dict, in declaration order.

        Values are passed through as-is, so ``command`` is the same list
        object that is held by the run.
        """
        exported = (
            "run_id",
            "pipeline_id",
            "status",
            "created_at",
            "started_at",
            "finished_at",
            "exit_code",
            "duration_seconds",
            "error",
            "run_dir",
            "script_path",
            "log_path",
            "artifacts_dir",
            "command",
            "retry_count",
            "max_retries",
        )
        return {name: getattr(self, name) for name in exported}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class PipelineExecutor:
    """Execute generated pipelines in managed local run directories.

    Every run gets its own directory
    ``<config.DATA_DIR>/pipeline_runs/<pipeline_id>/<run_id>/`` holding the
    generated ``pipeline.py``, a ``run.log`` with the process's
    stdout/stderr, and an ``artifacts`` directory.
    """

    def __init__(
        self,
        max_concurrent_runs: int | None = None,
        max_retries: int | None = None,
        retry_backoff_seconds: int | None = None,
        on_state_change: Optional[Callable[[dict], Awaitable[None] | None]] = None,
    ):
        """Create the executor and make sure the base run directory exists.

        Args:
            max_concurrent_runs: Upper bound on simultaneously running
                subprocesses; a falsy value falls back to
                ``config.PIPELINE_MAX_CONCURRENT_RUNS``.
            max_retries: Retries after a non-completed attempt; ``None``
                falls back to ``config.PIPELINE_MAX_RETRIES``.
            retry_backoff_seconds: Pause between attempts; ``None`` falls
                back to ``config.PIPELINE_RETRY_BACKOFF_SECONDS``.
            on_state_change: Optional callback receiving ``run.to_dict()``
                on each state transition; it may be sync or return an
                awaitable.
        """
        self.base_dir = Path(config.DATA_DIR) / "pipeline_runs"
        self.base_dir.mkdir(parents=True, exist_ok=True)
        # pipeline_id -> {run_id -> PipelineRun}
        self.runs_by_pipeline: dict[str, dict[str, PipelineRun]] = {}
        # run_id -> background task (only for execute_pipeline_async)
        self.tasks: dict[str, asyncio.Task] = {}
        self.max_concurrent_runs = max_concurrent_runs or config.PIPELINE_MAX_CONCURRENT_RUNS
        self.max_retries = max_retries if max_retries is not None else config.PIPELINE_MAX_RETRIES
        self.retry_backoff_seconds = (
            retry_backoff_seconds
            if retry_backoff_seconds is not None
            else config.PIPELINE_RETRY_BACKOFF_SECONDS
        )
        self._semaphore = asyncio.Semaphore(max(1, self.max_concurrent_runs))
        self.on_state_change = on_state_change

    def _register_run(self, run: PipelineRun):
        """Index ``run`` under its pipeline id and run id."""
        bucket = self.runs_by_pipeline.setdefault(run.pipeline_id, {})
        bucket[run.run_id] = run

    async def _emit_state(self, run: PipelineRun):
        """Notify ``on_state_change`` (if set) with the run's current state."""
        if not self.on_state_change:
            return
        # Function-scope import keeps this block self-contained.
        import inspect

        result = self.on_state_change(run.to_dict())
        # The callback is typed as returning an Awaitable, so accept any
        # awaitable (coroutine, Future, Task, ...) rather than coroutines only.
        if inspect.isawaitable(result):
            await result

    def list_runs(self, pipeline_id: str) -> list[dict]:
        """Return all runs of a pipeline as dicts, newest ``created_at`` first."""
        runs = list(self.runs_by_pipeline.get(pipeline_id, {}).values())
        runs.sort(key=lambda r: r.created_at, reverse=True)
        return [r.to_dict() for r in runs]

    def get_run(self, pipeline_id: str, run_id: str) -> Optional[dict]:
        """Return one run as a dict, or ``None`` if it is not registered."""
        run = self.runs_by_pipeline.get(pipeline_id, {}).get(run_id)
        return run.to_dict() if run else None

    def _prepare_run_files(self, pipeline: MLPipeline, run_id: str) -> PipelineRun:
        """Create the run directory, write ``pipeline.py`` and build the run record."""
        run_dir = self.base_dir / pipeline.id / run_id
        run_dir.mkdir(parents=True, exist_ok=True)

        script_path = run_dir / "pipeline.py"
        log_path = run_dir / "run.log"
        artifacts_dir = run_dir / "artifacts"
        artifacts_dir.mkdir(parents=True, exist_ok=True)

        # repr() yields a valid Python literal for any path, including ones
        # that contain quotes (a hand-written r'...' wrapper would not).
        script_header = (
            "import os\n"
            "import subprocess\n"
            f"os.environ.setdefault('VECTOR_MINDS_ARTIFACT_DIR', {artifacts_dir.as_posix()!r})\n\n"
        )
        script_body = self._transpile_notebook_to_python(pipeline.notebook_content)
        script_path.write_text(script_header + script_body, encoding="utf-8")

        return PipelineRun(
            run_id=run_id,
            pipeline_id=pipeline.id,
            run_dir=str(run_dir),
            script_path=str(script_path),
            log_path=str(log_path),
            artifacts_dir=str(artifacts_dir),
        )

    def _transpile_notebook_to_python(self, text: str) -> str:
        """Convert lightweight notebook magics to runnable Python.

        Lines starting with ``!`` become ``subprocess.check_call(..., shell=True)``;
        lines starting with ``%`` are replaced by a ``pass`` carrying an
        explanatory comment; every other line is kept verbatim. The check is
        purely line-based, so it also applies to lines inside string literals.
        ``None`` or empty input yields a script consisting of one newline.
        """
        out_lines: list[str] = []
        for raw in (text or "").splitlines():
            stripped = raw.lstrip()
            indent = raw[: len(raw) - len(stripped)]
            if stripped.startswith("!"):
                # Escape for embedding inside a double-quoted Python literal.
                shell_cmd = stripped[1:].strip().replace("\\", "\\\\").replace('"', '\\"')
                out_lines.append(
                    f'{indent}subprocess.check_call("{shell_cmd}", shell=True)'
                )
                continue
            if stripped.startswith("%"):
                # Emit a real statement: a bare comment would leave an empty
                # (invalid) block when the magic was the block's only line.
                out_lines.append(f"{indent}pass  # skipped notebook magic: {stripped}")
                continue
            out_lines.append(raw)
        return "\n".join(out_lines) + "\n"

    async def _run_subprocess_once(self, run: PipelineRun, timeout_seconds: int):
        """Run the generated script once and record the outcome on ``run``.

        Sets ``status`` to ``completed``/``failed``/``timeout``, fills in
        ``exit_code``, timestamps and duration, and emits a state change
        before and after the attempt. A failure to open the log or launch
        the process is recorded as ``failed`` instead of propagating.
        """
        start = datetime.now(timezone.utc)
        run.started_at = start.isoformat()
        run.status = "running"
        await self._emit_state(run)

        cmd = [sys.executable, "-u", run.script_path]
        run.command = cmd

        env = os.environ.copy()
        env["PYTHONUNBUFFERED"] = "1"
        env["VECTOR_MINDS_RUN_ID"] = run.run_id
        env["VECTOR_MINDS_PIPELINE_ID"] = run.pipeline_id
        env["VECTOR_MINDS_ARTIFACT_DIR"] = run.artifacts_dir

        process = None
        try:
            with open(run.log_path, "w", encoding="utf-8") as log_file:
                process = await asyncio.create_subprocess_exec(
                    *cmd,
                    cwd=run.run_dir,
                    stdout=log_file,
                    stderr=log_file,
                    env=env,
                )
                try:
                    await asyncio.wait_for(process.wait(), timeout=timeout_seconds)
                except asyncio.TimeoutError:
                    try:
                        process.kill()
                    except ProcessLookupError:
                        pass  # exited on its own right at the deadline
                    await process.wait()
                    run.status = "timeout"
                    run.error = f"Execution exceeded timeout ({timeout_seconds}s)"
                else:
                    run.status = "completed" if process.returncode == 0 else "failed"
                run.exit_code = process.returncode
        except asyncio.CancelledError:
            # Do not leave an orphaned child behind when the task is cancelled.
            if process is not None and process.returncode is None:
                try:
                    process.kill()
                except ProcessLookupError:
                    pass
            raise
        except OSError as exc:
            # Log file could not be opened or the interpreter failed to start.
            run.status = "failed"
            run.error = f"Failed to start pipeline process: {exc}"

        end = datetime.now(timezone.utc)
        run.finished_at = end.isoformat()
        run.duration_seconds = round((end - start).total_seconds(), 3)
        await self._emit_state(run)

    async def _run_with_retry(self, run: PipelineRun, timeout_seconds: int):
        """Run ``run`` until it completes or the retry budget is exhausted.

        A concurrency slot is held only while an attempt is executing; it is
        released during the backoff pause so other queued runs can proceed.
        """
        run.max_retries = max(0, self.max_retries)
        while True:
            async with self._semaphore:
                await self._run_subprocess_once(run, timeout_seconds=timeout_seconds)
            if run.status == "completed" or run.retry_count >= run.max_retries:
                return
            # Reset per-attempt fields and go back to the queue.
            run.retry_count += 1
            run.status = "queued"
            run.error = ""
            run.exit_code = None
            run.started_at = None
            run.finished_at = None
            run.duration_seconds = None
            await self._emit_state(run)
            await asyncio.sleep(max(1, self.retry_backoff_seconds))

    def _create_run(self, pipeline: MLPipeline) -> PipelineRun:
        """Allocate a run id, write the run files and register the run."""
        run_id = str(uuid.uuid4())
        run = self._prepare_run_files(pipeline, run_id)
        self._register_run(run)
        run.max_retries = max(0, self.max_retries)
        return run

    async def execute_pipeline(
        self,
        pipeline: MLPipeline,
        timeout_seconds: int = 1800,
    ) -> dict:
        """Run the pipeline to its final state and return that state as a dict."""
        run = self._create_run(pipeline)
        await self._run_with_retry(run, timeout_seconds=timeout_seconds)
        return run.to_dict()

    def execute_pipeline_async(
        self,
        pipeline: MLPipeline,
        timeout_seconds: int = 1800,
    ) -> dict:
        """Start the pipeline as a background task and return its initial state.

        Must be called while an event loop is running, since it uses
        ``asyncio.create_task``. The task is tracked in ``self.tasks`` until
        it finishes.
        """
        run = self._create_run(pipeline)
        run_id = run.run_id

        task = asyncio.create_task(self._run_with_retry(run, timeout_seconds=timeout_seconds))
        self.tasks[run_id] = task

        def _cleanup(done: asyncio.Task):
            self.tasks.pop(run_id, None)
            if done.cancelled():
                return
            exc = done.exception()
            if exc is not None:
                # Surface an unexpected crash on the run record instead of
                # leaving it stuck in a non-terminal state.
                run.status = "failed"
                run.error = run.error or f"{type(exc).__name__}: {exc}"

        task.add_done_callback(_cleanup)
        return run.to_dict()
|