From 0f1375912bc86e8aa6362fd878637a6fe4bddaf1 Mon Sep 17 00:00:00 2001 From: anti Date: Sun, 10 May 2026 06:17:32 -0400 Subject: [PATCH] feat(text): initial decnet_behave_text spec + tests Text/messaging-domain behavioral observation registry layered on core. SPDX: GPL-3.0-or-later (code) / CC-BY-SA-4.0 (attribution-recipes.md). Co-Authored-By: Claude Opus 4.7 --- BEHAVE-TEXT/attribution-recipes.md | 31 ++ BEHAVE-TEXT/decnet_behave_text/__init__.py | 0 .../decnet_behave_text/spec/__init__.py | 43 +++ .../decnet_behave_text/spec/envelope.py | 53 ++++ .../decnet_behave_text/spec/primitives.py | 290 ++++++++++++++++++ BEHAVE-TEXT/json/observation.schema.json | 144 +++++++++ BEHAVE-TEXT/pyproject.toml | 33 ++ BEHAVE-TEXT/scripts/generate_schema.py | 42 +++ BEHAVE-TEXT/tests/test_primitives.py | 101 ++++++ 9 files changed, 737 insertions(+) create mode 100644 BEHAVE-TEXT/attribution-recipes.md create mode 100644 BEHAVE-TEXT/decnet_behave_text/__init__.py create mode 100644 BEHAVE-TEXT/decnet_behave_text/spec/__init__.py create mode 100644 BEHAVE-TEXT/decnet_behave_text/spec/envelope.py create mode 100644 BEHAVE-TEXT/decnet_behave_text/spec/primitives.py create mode 100644 BEHAVE-TEXT/json/observation.schema.json create mode 100644 BEHAVE-TEXT/pyproject.toml create mode 100644 BEHAVE-TEXT/scripts/generate_schema.py create mode 100644 BEHAVE-TEXT/tests/test_primitives.py diff --git a/BEHAVE-TEXT/attribution-recipes.md b/BEHAVE-TEXT/attribution-recipes.md new file mode 100644 index 0000000..14836f5 --- /dev/null +++ b/BEHAVE-TEXT/attribution-recipes.md @@ -0,0 +1,31 @@ + + +# BEHAVE-TEXT Attribution Recipes + +> **This document is not part of BEHAVE-TEXT.** BEHAVE-TEXT (`scratchpad.md`) defines the observation taxonomy and emission envelope. It does **not** assert who an actor is, link sessions, or assign profiles. Those are attribution-engine concerns. +> +> This document is a **placeholder**. Recipes for the text domain wait for corpus calibration. 
The Rutify Telegram corpus (forthcoming) will be the labeling ground truth that drives the first concrete profiles. + +--- + +## What goes here eventually + +When BEHAVE-TEXT has a calibrated corpus, this document will mirror BEHAVE-SHELL's `attribution-recipes.md` structure: + +1. **Engine Interface** — what the engine consumes from BEHAVE-TEXT (`actor.observation.text.*` topics) plus user-supplied labels (`identity.label.applied`); what it emits (`attribution.profile.candidate`, `attribution.profile.current`, `attribution.linkage.proposed`). +2. **Profile Recipes** — observation-pattern definitions for each text-domain operator class. Likely starting points based on the Rutify domain: + - `credential_broker` — high transactional_language, high boasting_pattern, broadcast attention_pattern. + - `low_skill_buyer` — low vocabulary_richness, slow response_latency, high question_formation_style:lexical. + - `group_admin` — high conversation_initiation_rate, focused attention_pattern, high opsec_awareness. + - `lurker_or_observer` — minimal message volume, near-zero conversation_initiation_rate. + - `bot_or_automated_poster` — perfect punctuation_style consistency, no typo_signature, machine-pasted message_length distribution. +3. **Linkage Rules** — rules for proposing identity links across accounts based on stylometric signature similarity. The function_word_distribution simhash is the load-bearing primitive here (Hamming-comparable across sessions, hard to consciously fake). +4. **User-Owned Topic Schemas** — `identity.label.applied` and `identity.engagement.authorized` schemas for the text domain. + +## What stays out + +Same boundary as BEHAVE-SHELL's recipes: profiles describe observation *patterns*, not operator types. Engines combine BEHAVE-TEXT primitives with BEHAVE-SHELL primitives (when the same identity appears in both substrates) and with user-supplied labels to produce attribution. 
+ +## Status + +**Empty until the Rutify corpus is processed.** Adding speculative recipes here without corpus validation would repeat the v0.1 mistake of emitting confidently-wrong observations. The five labelled BEHAVE-SHELL sessions (HUMAN, YOU-sim, LW-sim, CLAUDE-FF, CLAUDE-CL) are the model: profiles get written *after* a labelled calibration grid exists, not before. diff --git a/BEHAVE-TEXT/decnet_behave_text/__init__.py b/BEHAVE-TEXT/decnet_behave_text/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/BEHAVE-TEXT/decnet_behave_text/spec/__init__.py b/BEHAVE-TEXT/decnet_behave_text/spec/__init__.py new file mode 100644 index 0000000..723e293 --- /dev/null +++ b/BEHAVE-TEXT/decnet_behave_text/spec/__init__.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +"""BEHAVE-TEXT spec — text/messaging-domain registry, layered on decnet-behave-core. + +Public API: + + from decnet_behave_text.spec import Observation, Window, OBSERVATION_SCHEMA_VERSION + from decnet_behave_text.spec import PRIMITIVE_REGISTRY, ValueKind, ValueTypeSpec + from decnet_behave_text.spec import TOPIC_PREFIX, event_topic_for + +The ``Observation`` exported here is a registry-aware subclass of the base +class from ``decnet-behave-core``; it validates that ``primitive`` is in the +text registry and that ``value`` matches the registry's per-primitive spec. + +See ``spec.envelope`` (and the core envelope module) for PII discipline. +""" + +from .envelope import OBSERVATION_SCHEMA_VERSION, Observation, ObservationValue, Window +from .primitives import PRIMITIVE_REGISTRY, ValueKind, ValueTypeSpec, get, is_known + +# Topic namespace deliberately uses *actor* (not *attacker*) because chat-group +# members may include observers, brokers, victims, and bystanders alongside +# threat actors. Attribution of role is the engine's job, not BEHAVE-TEXT's. 
TOPIC_PREFIX: str = "actor.observation.text"


def event_topic_for(primitive: str) -> str:
    """Build the bus topic on which observations of *primitive* are published.

    The topic is ``TOPIC_PREFIX`` followed by a dot and the primitive path,
    e.g. ``actor.observation.text.stylometric.emoji_usage``. The primitive is
    interpolated as given; it is not checked against the registry here.
    """
    topic = f"{TOPIC_PREFIX}.{primitive}"
    return topic
+""" + +from __future__ import annotations + +from pydantic import model_validator + +from decnet_behave_core.spec.envelope import ( + OBSERVATION_SCHEMA_VERSION, + ObservationValue, + Window, +) +from decnet_behave_core.spec.envelope import Observation as _BaseObservation + +from .primitives import PRIMITIVE_REGISTRY + + +class Observation(_BaseObservation): + """Text-domain Observation: base envelope + BEHAVE-TEXT registry check.""" + + @model_validator(mode="after") + def _validate_against_text_registry(self) -> "Observation": + spec = PRIMITIVE_REGISTRY.get(self.primitive) + if spec is None: + raise ValueError( + f"unknown primitive {self.primitive!r}; " + f"add it to spec/primitives.py:PRIMITIVE_REGISTRY first" + ) + try: + spec.validate_value(self.value) + except ValueError as exc: + raise ValueError( + f"value invalid for primitive {self.primitive!r}: {exc}" + ) from None + return self + + +__all__ = [ + "OBSERVATION_SCHEMA_VERSION", + "Observation", + "ObservationValue", + "Window", +] diff --git a/BEHAVE-TEXT/decnet_behave_text/spec/primitives.py b/BEHAVE-TEXT/decnet_behave_text/spec/primitives.py new file mode 100644 index 0000000..2ae7926 --- /dev/null +++ b/BEHAVE-TEXT/decnet_behave_text/spec/primitives.py @@ -0,0 +1,290 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +"""BEHAVE-TEXT primitive registry. + +Source-of-truth for what `Observation.primitive` may be in the text/messaging +domain and what `Observation.value` must look like. Mirrors every row in the +primitive tables of `scratchpad.md`. + +PII discipline notice (carried over from decnet-behave-core's envelope module): + TEXT-domain observations carry CATEGORICAL LABELS, AGGREGATE RATES, and + HASHES of distributions. Sensors operating on Telegram/messaging text MUST + NOT emit raw message content into BEHAVE-TEXT observations — only derived + features. The `evidence_ref` field points to the underlying message store + held elsewhere; never into the message body itself. 
class ValueKind(str, Enum):
    """Discriminator for the shape an `Observation.value` must take."""

    CATEGORICAL = "categorical"
    NUMERIC = "numeric"
    HASH = "hash"
    ARRAY = "array"
    FREE_STRING = "free_string"
    BOOL = "bool"


class ValueTypeSpec(BaseModel):
    """Per-primitive value-type spec (mirrors BEHAVE-SHELL's shape).

    ``validate_value`` consults only the fields that apply to ``kind``:
    ``allowed`` for CATEGORICAL, ``min_val``/``max_val`` for NUMERIC and
    ``array_of`` for ARRAY. ``notes`` is never used for validation.
    """

    kind: ValueKind
    # CATEGORICAL: permitted labels; None disables the membership check.
    allowed: Optional[list[str]] = Field(default=None)
    # NUMERIC: inclusive bounds; None leaves that side unbounded.
    min_val: Optional[float] = Field(default=None)
    max_val: Optional[float] = Field(default=None)
    # ARRAY: kind each element must satisfy; None skips per-element checks.
    array_of: Optional[ValueKind] = Field(default=None)
    # Free-form commentary; status flags such as EXPERIMENTAL are written here.
    notes: Optional[str] = Field(default=None)

    def validate_value(self, value: Any) -> None:
        """Check *value* against this spec.

        Returns ``None`` when the value is acceptable.

        Raises:
            ValueError: if the value has the wrong Python type for ``kind``,
                is outside ``allowed`` / ``min_val`` / ``max_val``, is NaN,
                is an empty hash string, or (for arrays) contains an element
                that fails the ``array_of`` check.
        """
        import math  # function-scope: only the NUMERIC branch needs it

        if self.kind is ValueKind.CATEGORICAL:
            if not isinstance(value, str):
                raise ValueError(f"expected categorical string, got {type(value).__name__}")
            if self.allowed is not None and value not in self.allowed:
                raise ValueError(f"value {value!r} not in allowed set {self.allowed!r}")
        elif self.kind is ValueKind.NUMERIC:
            # bool is a subclass of int, so it must be excluded explicitly.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"expected numeric, got {type(value).__name__}")
            # NaN compares False against every bound, so without this guard it
            # would pass both range checks below. Only floats are tested so
            # that arbitrarily large ints are never converted to float.
            if isinstance(value, float) and math.isnan(value):
                raise ValueError("expected numeric, got NaN")
            if self.min_val is not None and value < self.min_val:
                raise ValueError(f"value {value} below min_val {self.min_val}")
            if self.max_val is not None and value > self.max_val:
                raise ValueError(f"value {value} above max_val {self.max_val}")
        elif self.kind is ValueKind.HASH:
            if not isinstance(value, str) or not value:
                raise ValueError("expected non-empty hash string")
        elif self.kind is ValueKind.FREE_STRING:
            if not isinstance(value, str):
                raise ValueError(f"expected string, got {type(value).__name__}")
        elif self.kind is ValueKind.BOOL:
            if not isinstance(value, bool):
                raise ValueError(f"expected bool, got {type(value).__name__}")
        elif self.kind is ValueKind.ARRAY:
            if not isinstance(value, list):
                raise ValueError(f"expected array, got {type(value).__name__}")
            if self.array_of is None:
                return
            # Elements are checked against a bare spec of the element kind:
            # no allowed-set or bounds apply to individual elements.
            element_spec = ValueTypeSpec(kind=self.array_of)
            for i, element in enumerate(value):
                try:
                    element_spec.validate_value(element)
                except ValueError as exc:
                    raise ValueError(f"array element [{i}]: {exc}") from None


# ─── Convenience constructors ───────────────────────────────────────────────

def _cat(*allowed: str, notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a CATEGORICAL spec whose allowed set is the given labels."""
    return ValueTypeSpec(kind=ValueKind.CATEGORICAL, allowed=list(allowed), notes=notes)


def _num(min_val: Optional[float] = None, max_val: Optional[float] = None, notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a NUMERIC spec with optional inclusive bounds."""
    return ValueTypeSpec(kind=ValueKind.NUMERIC, min_val=min_val, max_val=max_val, notes=notes)


def _hash(notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a HASH spec (any non-empty string)."""
    return ValueTypeSpec(kind=ValueKind.HASH, notes=notes)


def _str(notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a FREE_STRING spec (any string, including empty)."""
    return ValueTypeSpec(kind=ValueKind.FREE_STRING, notes=notes)


def _array(of: ValueKind, notes: Optional[str] = None) -> ValueTypeSpec:
    """Build an ARRAY spec whose elements must each be of kind *of*."""
    return ValueTypeSpec(kind=ValueKind.ARRAY, array_of=of, notes=notes)


# ─── The registry ───────────────────────────────────────────────────────────
#
# 35 primitives across 6 layers (stylometric 12, lexical 8, temporal_evolution 1,
# network 2, interaction 6, content 6). Mirrors scratchpad.md row-for-row.
# NOTE(review): several `notes` strings below name individual corpus accounts
# used during calibration. Confirm that shipping those handles in the spec is
# intended, given the PII discipline stated in this module's docstring.
PRIMITIVE_REGISTRY: dict[str, ValueTypeSpec] = {
    # ── stylometric.* (motor analog — 12) ─────────────────────────────────
    "stylometric.punctuation_style": _hash(notes="canonical punctuation-pattern fingerprint"),
    "stylometric.capitalization_habit": _cat("lowercase", "proper", "random_caps", "mixed_i"),
    "stylometric.emoji_usage": _cat("none", "occasional", "frequent", "exclusive"),
    "stylometric.emoji_placement": _cat("pre_punctuation", "post_punctuation", "no_punctuation", "mixed"),
    "stylometric.message_length_class": _cat("short", "medium", "long", "paragraph"),
    "stylometric.message_length_variance_class": _cat(
        "tight", "varied", "bimodal",
        notes="Coefficient of variation of per-message word counts. Captures "
              "DISTRIBUTION SHAPE that message_length_class collapses by "
              "emitting only the median bucket. Two authors can share the same "
              "median length but have wildly different variance: `tight` (CV<0.5) "
              "= consistent (always 1-3 words), `varied` (0.5<=CV<1.5) = normal "
              "mix, `bimodal` (CV>=1.5) = long-tail (mostly short with occasional "
              "rants). Added in v0.2 after Rutify calibration found median-only "
              "bucketing discarded most of the per-author variance signal.",
    ),
    "stylometric.linebreak_style": _cat("single_thought", "multi_line", "wall_of_text"),
    "stylometric.typo_signature": _hash(notes="sha256 of canonical persistent-typo set"),
    "stylometric.function_word_distribution_top50": _hash(
        notes="64-bit simhash over the 50-most-common Spanish function-word frequency "
              "vector. Mosteller-Wallace gold standard for English long-form authorship; "
              "EMPIRICALLY DOMAIN-FLAWED for Spanish chat-domain — calibrated 2026-05-02 "
              "against the Rutify corpus showed within-author and cross-author Hamming "
              "distance distributions overlap (within median 8 bits, cross median 10 "
              "bits) so this primitive ALONE cannot discriminate authors in chat-style "
              "short-message corpora. Engines should weight it low until paired with "
              "the larger top-200 variant or composited with character n-gram and "
              "distinctive-vocabulary signatures (see siblings below). Kept in v0 for "
              "calibration grids and documentary purposes.",
    ),
    "stylometric.function_word_distribution_top200": _hash(
        notes="64-bit simhash over the 200-most-common Spanish function-word frequency "
              "vector. The wider list reaches into the long tail (rare-but-individual "
              "function words like `tampoco`, `aunque`, `mientras`) that carry more "
              "discriminating signal in short-message chat domains. NOT YET EMITTED by "
              "the v0 prototype extractor; populated when v0.2 calibration is done.",
    ),
    "stylometric.character_ngram_simhash": _hash(
        notes="64-bit simhash over a frequency vector of character n-grams (default "
              "n=3) from the author's lowercased text corpus. ORTHOGONAL to "
              "function-word distributions: captures punctuation tics, accent-"
              "stripping habits, typo patterns, and idiom-fragment fingerprints "
              "that survive paraphrase. Lowercases input so that capitalization "
              "habits — already captured by stylometric.capitalization_habit — "
              "do not double-count. Accents PRESERVED because accent-stripping is "
              "itself a stylistic tic worth catching. Source label declares n size "
              "(e.g. `#char3gram`, `#char4gram`).",
    ),
    "stylometric.distinctive_vocabulary_signature": _hash(
        notes="64-bit simhash over a TF-IDF-weighted top-K rare-word vector. "
              "COMPLEMENTARY to function-word distributions: where function_word_* "
              "captures common-word *style*, this captures the author's distinctive "
              "*lexicon* (the words this person uses that other authors in the same "
              "corpus do NOT). Strong against context-shift because rare words are "
              "where authorial choice lives. Requires the chat corpus for IDF "
              "computation, performed once per extraction. Source label declares the "
              "top-K size and corpus tag (e.g. `#tfidf-top50`).",
    ),

    # ── lexical.* (cognitive analog — 8) ──────────────────────────────────
    "lexical.vocabulary_richness": _num(
        min_val=0.0, max_val=1.0,
        notes="Moving-Average Type-Token Ratio (MATTR) over a sliding window "
              "(default 50 tokens). Volume-independent: each window contributes "
              "its own unique/total ratio, the primitive's value is the mean. "
              "Avoids the standard TTR bias where larger corpora mechanically "
              "score lower. Source label declares the window size.",
    ),
    "lexical.slang_density": _num(min_val=0.0, max_val=1.0,
                                  notes="rate per message; locale-tuned slang corpus"),
    "lexical.code_switching_rate": _num(min_val=0.0, max_val=1.0,
                                        notes="switches per N tokens; Solorio & Liu metric"),
    "lexical.code_switching_matrix_language": _str(notes="BCP-47 of dominant language"),
    "lexical.code_switching_embedded_languages": _array(ValueKind.FREE_STRING,
                                                        notes="BCP-47 list of non-matrix languages observed"),
    "lexical.sentence_complexity_class": _cat("simple", "compound", "complex"),
    "lexical.question_formation_style": _cat("punctuation_only", "lexical", "formal"),
    "lexical.imperative_style": _cat("informal_directive", "formal_directive", "polite"),

    # ── temporal_evolution.* (lifecycle / change-over-time — 1) ───────────
    "temporal_evolution.lifecycle_phase": _cat(
        "arrival_burst", "stable_member", "fluctuating_member",
        "inflection_member", "declining_member", "unknown",
        notes="Auto-classified lifecycle stage derived from windowed within-"
              "corpus analysis. arrival_burst: tenure < 24hr with first-window "
              "volume dominating later windows and high inter-window drift "
              "(empirically validated 2026-05-03 against OxPayload's first 12 "
              "hours on Rutify). stable_member: low drift between consecutive "
              "windows across the whole tenure. fluctuating_member (added v0.3): "
              "tenure ≥ 24hr with median drift in [stable_max, inflection_min) "
              "and no single window crossing inflection_min — established noisy "
              "regulars who don't fit clean stable/inflection classes (e.g. "
              "labelled admin lamarabitch, formerly classified unknown). "
              "inflection_member: long-tenure actor whose drift spikes in at "
              "least one window-pair (a real behavioral shift mid-corpus). "
              "declining_member: monotonically decreasing per-window message "
              "counts. unknown: insufficient windowed data for classification. "
              "Window size adapts to tenure: <24hr → 2h windows, <7d → 12h, "
              "<30d → 1d, otherwise 7d.",
    ),

    # ── network.* (governance/role-shape signals — 2, added v0.3) ─────────
    "network.is_likely_bot": _cat(
        "likely_bot", "not_bot", "unknown",
        notes="Heuristic bot detector composited from existing primitives. "
              "Classifies as likely_bot when conversation_initiation_rate ≥ 0.95 "
              "AND attention_pattern = broadcast AND vocabulary_richness < 0.65. "
              "Empirically validated 2026-05-03 against the tdl-labeled Rutify "
              "bot SangMata_beta_bot (correctly caught) vs 11 high-volume humans "
              "in the same corpus (none false-positive). NOT a verdict — engines "
              "should treat as a candidate signal, especially since low-volume "
              "bots (e.g. QuotLyBot with 9 messages) sit below the fingerprint "
              "threshold and emit nothing here. Source label declares the "
              "heuristic version (e.g. #bot-heuristic-v1).",
    ),
    "network.governance_role_signal": _cat(
        "admin_pattern", "responder_pattern", "regular", "bot_pattern", "unknown",
        notes="Heuristic role-shape composited from interaction primitives + "
              "lifecycle_phase. admin_pattern: init_rate ≥ 0.80 AND attn = "
              "reciprocal AND non-bot AND not arrival_burst. responder_pattern: "
              "init_rate ≤ 0.45 AND attn = reciprocal. bot_pattern: matches "
              "network.is_likely_bot likely_bot. regular: everything else above "
              "the volume threshold. Empirically caught all 4 high-volume "
              "tdl-labeled Rutify admins, sebaImlI as responder, "
              "SangMata_beta_bot as bot, OxPayload/bopxcx as regular (their "
              "arrival_burst lifecycle overrides the admin-shaped init_rate). "
              "NOT a ground-truth admin label — kkaxlazer matches admin_pattern "
              "while not formally admin, but the 2026-05-03 reply-graph cohort "
              "analysis showed they're operationally embedded in the admin "
              "layer (4/4 cohort signal with the top admin), so the heuristic "
              "is doing the right thing.",
    ),

    # ── interaction.* (temporal analog — 6) ───────────────────────────────
    "interaction.response_latency_class": _cat("immediate", "fast", "normal", "slow", "sporadic"),
    "interaction.conversation_initiation_rate": _num(min_val=0.0, max_val=1.0,
                                                     notes="thread-starting messages / total"),
    "interaction.message_burst_rate": _cat("single", "occasional", "habitual"),
    "interaction.active_hours_class": _str(notes="UTC active-hours window summary"),
    "interaction.session_duration_class": _cat("short", "medium", "long", "marathon",
                                               notes="REUSED enum from BEHAVE-SHELL temporal.session_duration"),
    "interaction.attention_pattern": _cat("broadcast", "focused", "reciprocal",
                                          notes="from reply-graph centrality"),

    # ── content.* (operational analog — 6, EXPERIMENTAL) ──────────────────
    "content.role_signal": _cat("admin", "seller", "buyer", "lurker", "newbie",
                                notes="EXPERIMENTAL — locale-tuned role-vocabulary classifier; "
                                      "may be moved to a separate IOC/keyword-detection layer "
                                      "once tested against the Rutify corpus"),
    "content.transactional_language": _num(min_val=0.0, max_val=1.0,
                                           notes="EXPERIMENTAL — rate of transactional terms; "
                                                 "locale-specific, brittle to vocabulary drift"),
    "content.opsec_awareness": _num(min_val=0.0, max_val=1.0,
                                    notes="EXPERIMENTAL — rate of security-conscious phrases; "
                                          "HIGH FALSE-POSITIVE RISK on casual conversation about "
                                          "deleting files / messages"),
    "content.targeting_language": _array(ValueKind.FREE_STRING,
                                         notes="EXPERIMENTAL — IOC-shaped target patterns "
                                               "(bank names, government portals, RUT ranges, etc); "
                                               "consider moving to dedicated IOC layer"),
    "content.boasting_pattern": _cat("none", "occasional", "frequent",
                                     notes="EXPERIMENTAL — success-claim regex; corpus-dependent"),
    "content.conflict_style": _cat("aggressive", "defusing", "appellate",
                                   notes="EXPERIMENTAL — dispute-tone classifier; needs "
                                         "labelled training data"),
}


def is_known(primitive: str) -> bool:
    """Return True if *primitive* is a key of ``PRIMITIVE_REGISTRY``."""
    return primitive in PRIMITIVE_REGISTRY


def get(primitive: str) -> ValueTypeSpec:
    """Return the value-type spec for *primitive*; raise KeyError if unknown."""
    return PRIMITIVE_REGISTRY[primitive]
Distinct from ``Observation.ts``\n(the emission time), because a sensor may compute an observation over\na window in the past and emit it later.", + "properties": { + "end_ts": { + "description": "Window end, epoch seconds (>= start_ts)", + "title": "End Ts", + "type": "number" + }, + "start_ts": { + "description": "Window start, epoch seconds", + "title": "Start Ts", + "type": "number" + } + }, + "required": [ + "start_ts", + "end_ts" + ], + "title": "Window", + "type": "object" + } + }, + "$id": "https://behave.local/schema/text/observation/v1.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "additionalProperties": false, + "description": "Text-domain Observation: base envelope + BEHAVE-TEXT registry check.", + "properties": { + "confidence": { + "description": "Sensor's confidence in this measurement (not in any downstream verdict)", + "maximum": 1.0, + "minimum": 0.0, + "title": "Confidence", + "type": "number" + }, + "evidence_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Pointer to underlying raw evidence; NEVER the evidence itself", + "title": "Evidence Ref" + }, + "id": { + "description": "UUID for dedup", + "title": "Id", + "type": "string" + }, + "identity_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "AttackerIdentity UUID if the observation is pre-attributed", + "title": "Identity Ref" + }, + "primitive": { + "description": "Fully-qualified primitive path, e.g. 'motor.keystroke_cadence'", + "title": "Primitive", + "type": "string" + }, + "source": { + "description": "Canonical sensor identifier, e.g. 
'decnet/sniffer/timing.py'", + "minLength": 1, + "title": "Source", + "type": "string" + }, + "ts": { + "description": "Emission timestamp, epoch seconds", + "title": "Ts", + "type": "number" + }, + "v": { + "default": 1, + "description": "Envelope schema version", + "title": "V", + "type": "integer" + }, + "value": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "items": { + "type": "number" + }, + "type": "array" + }, + { + "additionalProperties": true, + "type": "object" + } + ], + "description": "Value typed by the primitive's registry entry; see spec.primitives", + "title": "Value" + }, + "window": { + "$ref": "#/$defs/Window", + "description": "Measurement window" + } + }, + "required": [ + "primitive", + "value", + "confidence", + "window", + "source" + ], + "title": "Observation", + "type": "object" +} diff --git a/BEHAVE-TEXT/pyproject.toml b/BEHAVE-TEXT/pyproject.toml new file mode 100644 index 0000000..ed0402e --- /dev/null +++ b/BEHAVE-TEXT/pyproject.toml @@ -0,0 +1,33 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "decnet-behave-text" +version = "0.1.0" +description = "BEHAVE-TEXT — text/messaging-domain behavioral observation registry, layered on decnet-behave-core" +requires-python = ">=3.11" +license = { text = "GPL-3.0-or-later" } +authors = [{ name = "ANTI" }] +dependencies = ["pydantic>=2.6", "decnet-behave-core>=0.1.0"] + +[project.optional-dependencies] +dev = ["pytest>=8", "pytest-cov", "ruff"] + +[project.urls] +"Source" = "https://git.resacachile.cl/anti/BEHAVE" + +[tool.setuptools.packages.find] +include = ["decnet_behave_text*"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "I", "B", "UP"] +ignore = 
["E501"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-q --import-mode=importlib" diff --git a/BEHAVE-TEXT/scripts/generate_schema.py b/BEHAVE-TEXT/scripts/generate_schema.py new file mode 100644 index 0000000..5c2c5e9 --- /dev/null +++ b/BEHAVE-TEXT/scripts/generate_schema.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +"""Regenerate BEHAVE-TEXT/json/observation.schema.json from the Pydantic source. + +Idempotent — CI can gate on `git diff --quiet` after running this. + +The artifact is functionally identical to BEHAVE-SHELL's (they share the same +core envelope), modulo the ``$id`` URL identifying the publishing package. +""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parent.parent +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from decnet_behave_text.spec.envelope import OBSERVATION_SCHEMA_VERSION, Observation # noqa: E402 + + +def build_schema() -> dict: + schema = Observation.model_json_schema() + schema["$id"] = ( + f"https://behave.local/schema/text/observation/v{OBSERVATION_SCHEMA_VERSION}.json" + ) + schema["$schema"] = "https://json-schema.org/draft/2020-12/schema" + return schema + + +def main() -> int: + schema = build_schema() + out = _REPO_ROOT / "json" / "observation.schema.json" + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(schema, indent=2, sort_keys=True) + "\n") + print(f"wrote {out}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/BEHAVE-TEXT/tests/test_primitives.py b/BEHAVE-TEXT/tests/test_primitives.py new file mode 100644 index 0000000..e6ceedd --- /dev/null +++ b/BEHAVE-TEXT/tests/test_primitives.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +"""Registry coverage tests for BEHAVE-TEXT. + +Asserts that every primitive listed in scratchpad.md's tables has exactly one +entry in PRIMITIVE_REGISTRY. 
# Primitive paths expected by scratchpad.md (hand-extracted; v0).
EXPECTED_PRIMITIVES = {
    # stylometric.* (motor analog — 12)
    "stylometric.punctuation_style",
    "stylometric.capitalization_habit",
    "stylometric.emoji_usage",
    "stylometric.emoji_placement",
    "stylometric.message_length_class",
    "stylometric.message_length_variance_class",
    "stylometric.linebreak_style",
    "stylometric.typo_signature",
    "stylometric.function_word_distribution_top50",
    "stylometric.function_word_distribution_top200",
    "stylometric.character_ngram_simhash",
    "stylometric.distinctive_vocabulary_signature",
    # lexical.* (cognitive analog — 8)
    "lexical.vocabulary_richness",
    "lexical.slang_density",
    "lexical.code_switching_rate",
    "lexical.code_switching_matrix_language",
    "lexical.code_switching_embedded_languages",
    "lexical.sentence_complexity_class",
    "lexical.question_formation_style",
    "lexical.imperative_style",
    # temporal_evolution.* (lifecycle/change-over-time — 1, added v0.2)
    "temporal_evolution.lifecycle_phase",
    # network.* (governance/role-shape — 2, added v0.3)
    "network.is_likely_bot",
    "network.governance_role_signal",
    # interaction.* (temporal analog — 6)
    "interaction.response_latency_class",
    "interaction.conversation_initiation_rate",
    "interaction.message_burst_rate",
    "interaction.active_hours_class",
    "interaction.session_duration_class",
    "interaction.attention_pattern",
    # content.* (operational analog — 6, EXPERIMENTAL)
    "content.role_signal",
    "content.transactional_language",
    "content.opsec_awareness",
    "content.targeting_language",
    "content.boasting_pattern",
    "content.conflict_style",
}


def test_registry_covers_expected_primitives_exactly():
    """Registry keys and EXPECTED_PRIMITIVES must be the same set."""
    registered = set(PRIMITIVE_REGISTRY)
    absent = sorted(EXPECTED_PRIMITIVES - registered)
    unexpected = sorted(registered - EXPECTED_PRIMITIVES)
    assert not absent, f"registry missing: {absent}"
    assert not unexpected, f"registry has unexpected entries: {unexpected}"


def test_every_primitive_has_a_valid_spec():
    """Categorical specs need string labels; array specs need a flat element kind."""
    for name, spec in PRIMITIVE_REGISTRY.items():
        kind = spec.kind
        if kind is ValueKind.CATEGORICAL:
            labels = spec.allowed
            assert labels, f"{name}: categorical must define `allowed`"
            assert all(isinstance(label, str) for label in labels)
            continue
        if kind is ValueKind.ARRAY:
            element_kind = spec.array_of
            assert element_kind is not None, f"{name}: array must define `array_of`"
            assert element_kind is not ValueKind.ARRAY, (
                f"{name}: nested arrays not supported in v0"
            )


def test_primitive_paths_are_dotted_lowercase():
    """Every key is two or more lowercase snake_case segments joined by dots."""
    dotted_path = re.compile(r"^[a-z][a-z0-9_]*(\.[a-z][a-z0-9_]*)+$")
    for name in PRIMITIVE_REGISTRY:
        assert dotted_path.match(name), f"malformed primitive path: {name!r}"


def test_experimental_primitives_are_in_content_layer_only():
    """`status: experimental` should be confined to content.* in v0."""
    flagged = [
        name
        for name, spec in PRIMITIVE_REGISTRY.items()
        if spec.notes and "EXPERIMENTAL" in spec.notes
    ]
    for name in flagged:
        assert name.startswith("content."), (
            f"{name}: EXPERIMENTAL flag should only appear in content.* layer in v0"
        )


def test_topic_namespace_uses_actor_not_attacker():
    """The text-domain topic prefix must be `actor.*`, not `attacker.*`."""
    from decnet_behave_text.spec import TOPIC_PREFIX, event_topic_for

    expected_topic = "actor.observation.text.stylometric.emoji_usage"
    assert TOPIC_PREFIX == "actor.observation.text"
    assert event_topic_for("stylometric.emoji_usage") == expected_topic