Adds 12 new primitives across two waves of spec work this session.
meta.* layer (8 primitives) — corpus-snapshot footprint:
total_messages, corpus_span_days, msg_per_day, active_days,
activity_density, first_seen_ts, last_seen_ts, fingerprint_confidence.
Motivated by two actors with identical message counts (53 each) producing
indistinguishable profiles despite radically different presence shapes
(0.3-day burst vs 47-day long tail).
Language-aware characterization primitives (4 primitives):
stylometric.pos_ngram_signature — SimHash over POS bigram frequency vector;
syntactic skeleton fingerprint that survives full vocabulary paraphrase.
lexical.dialect_region — BCP-47 free_string (es-CL, es-AR, es-MX, …);
designed for EYENET integration with INGEOTEC regional-spanish-models.
lexical.evaluative_morphology_density — diminutive/augmentative/pejorative
suffix density; stable per-author trait baked into language acquisition.
lexical.optional_grammar_signature — SimHash over optional-grammar choice
points (compound/simple past, subjunctive, leísmo, relative pronoun);
high-reliability Spain vs LatAm discriminator.
Also fixes stale scratchpad.md references throughout (README.md is now the
authority), bumps behave-text to 0.1.3, and updates CHANGELOG.
476 lines
30 KiB
Python
476 lines
30 KiB
Python
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
"""BEHAVE-TEXT primitive registry.
|
|
|
|
Source-of-truth for what `Observation.primitive` may be in the text/messaging
|
|
domain and what `Observation.value` must look like. Mirrors every row in the
|
|
primitive tables of `README.md`.
|
|
|
|
PII discipline notice (carried over from behave-core's envelope module):
|
|
TEXT-domain observations carry CATEGORICAL LABELS, AGGREGATE RATES, and
|
|
HASHES of distributions. Sensors operating on Telegram/messaging text MUST
|
|
NOT emit raw message content into BEHAVE-TEXT observations — only derived
|
|
features. The `evidence_ref` field points to the underlying message store
|
|
held elsewhere; never into the message body itself.
|
|
|
|
This is a tighter constraint than BEHAVE-SHELL's because the source signal
|
|
IS text content. Sensors must hash/aggregate before emitting.
|
|
|
|
Adding a new primitive is a deliberate registry edit. Drift between this file
|
|
and `README.md` is a bug; v0 keeps the registry hand-written so PR review
|
|
catches drift, v0.x may auto-extract from the markdown if drift becomes a
|
|
maintenance issue.
|
|
|
|
Status flags appear in the `notes` field. `EXPERIMENTAL` marks primitives in
|
|
the `content.*` layer whose detector implementations are likely brittle; an
|
|
attribution engine may choose to weight those at zero until field-validated.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from enum import Enum
|
|
from typing import Any, Optional
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class ValueKind(str, Enum):
    """Tag naming the shape that an `Observation.value` is required to have.

    Members also subclass `str`, so each one compares equal to its
    lowercase string value.
    """

    # A string label, optionally restricted to a closed set.
    CATEGORICAL = "categorical"
    # An int or float (bools are rejected), optionally range-bounded.
    NUMERIC = "numeric"
    # A non-empty string holding a digest / fingerprint.
    HASH = "hash"
    # A list, optionally with a declared element kind.
    ARRAY = "array"
    # Any string, no further constraint.
    FREE_STRING = "free_string"
    # A strict Python bool.
    BOOL = "bool"
|
|
|
|
|
|
class ValueTypeSpec(BaseModel):
    """Per-primitive value-type spec (mirrors BEHAVE-SHELL's shape).

    Only the fields relevant to `kind` are consulted during validation:
    `allowed` for CATEGORICAL, `min_val` / `max_val` for NUMERIC and
    `array_of` for ARRAY. `notes` is documentation only and is never read
    by `validate_value`.
    """

    kind: ValueKind
    allowed: Optional[list[str]] = Field(default=None)
    min_val: Optional[float] = Field(default=None)
    max_val: Optional[float] = Field(default=None)
    array_of: Optional[ValueKind] = Field(default=None)
    notes: Optional[str] = Field(default=None)

    def validate_value(self, value: Any) -> None:
        """Check *value* against this spec.

        Returns None when the value conforms.

        Raises:
            ValueError: if the value has the wrong Python type for `kind`,
                is outside `allowed` (CATEGORICAL), is NaN or outside the
                inclusive `[min_val, max_val]` bounds (NUMERIC), is an empty
                string (HASH), or contains a non-conforming element (ARRAY
                with `array_of` set).
        """
        # Function-scope import keeps this block self-contained.
        import math

        if self.kind is ValueKind.CATEGORICAL:
            if not isinstance(value, str):
                raise ValueError(f"expected categorical string, got {type(value).__name__}")
            if self.allowed is not None and value not in self.allowed:
                raise ValueError(f"value {value!r} not in allowed set {self.allowed!r}")
        elif self.kind is ValueKind.NUMERIC:
            # bool is a subclass of int, so it has to be excluded explicitly.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"expected numeric, got {type(value).__name__}")
            # NaN compares False against every bound, so without this guard a
            # NaN (e.g. from a 0/0 rate) would pass as "in range".
            if isinstance(value, float) and math.isnan(value):
                raise ValueError("expected numeric, got NaN")
            if self.min_val is not None and value < self.min_val:
                raise ValueError(f"value {value} below min_val {self.min_val}")
            if self.max_val is not None and value > self.max_val:
                raise ValueError(f"value {value} above max_val {self.max_val}")
        elif self.kind is ValueKind.HASH:
            if not isinstance(value, str) or not value:
                raise ValueError("expected non-empty hash string")
        elif self.kind is ValueKind.FREE_STRING:
            if not isinstance(value, str):
                raise ValueError(f"expected string, got {type(value).__name__}")
        elif self.kind is ValueKind.BOOL:
            if not isinstance(value, bool):
                raise ValueError(f"expected bool, got {type(value).__name__}")
        elif self.kind is ValueKind.ARRAY:
            if not isinstance(value, list):
                raise ValueError(f"expected array, got {type(value).__name__}")
            if self.array_of is None:
                # No element kind declared: any list is accepted as-is.
                return
            # Elements are checked against a bare spec of the element kind
            # (no allowed set, no bounds).
            element_spec = ValueTypeSpec(kind=self.array_of)
            for i, element in enumerate(value):
                try:
                    element_spec.validate_value(element)
                except ValueError as exc:
                    # Re-raise with the index prefixed; the inner traceback
                    # is deliberately suppressed.
                    raise ValueError(f"array element [{i}]: {exc}") from None
|
|
|
|
|
|
# ─── Convenience constructors ───────────────────────────────────────────────
|
|
|
|
def _cat(*allowed: str, notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a CATEGORICAL spec whose allowed set is the positional labels."""
    labels = list(allowed)
    return ValueTypeSpec(
        kind=ValueKind.CATEGORICAL,
        allowed=labels,
        notes=notes,
    )
|
|
|
|
def _num(
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    notes: Optional[str] = None,
) -> ValueTypeSpec:
    """Build a NUMERIC spec with optional lower and upper bounds."""
    return ValueTypeSpec(
        kind=ValueKind.NUMERIC,
        min_val=min_val,
        max_val=max_val,
        notes=notes,
    )
|
|
|
|
def _hash(notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a HASH spec."""
    return ValueTypeSpec(
        kind=ValueKind.HASH,
        notes=notes,
    )
|
|
|
|
def _str(notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a FREE_STRING spec."""
    return ValueTypeSpec(
        kind=ValueKind.FREE_STRING,
        notes=notes,
    )
|
|
|
|
def _array(of: ValueKind, notes: Optional[str] = None) -> ValueTypeSpec:
    """Build an ARRAY spec whose elements must be of kind *of*."""
    return ValueTypeSpec(
        kind=ValueKind.ARRAY,
        array_of=of,
        notes=notes,
    )
|
|
|
|
|
|
# ─── The registry ───────────────────────────────────────────────────────────
|
|
#
|
|
# 47 primitives across 7 layers. Mirrors README.md row-for-row.
|
|
|
|
# Maps each fully-qualified primitive name ("<layer>.<name>") to the spec its
# observation values are validated against.
PRIMITIVE_REGISTRY: dict[str, ValueTypeSpec] = {
    # ── meta.* (corpus-snapshot footprint — 8) ────────────────────────────
    "meta.total_messages": _num(
        min_val=0.0,
        notes="Raw message count for this actor in the corpus snapshot. Integer in "
        "practice; stored as float for spec uniformity. Dependency anchor: "
        "msg_per_day is derived from this; fingerprint_confidence is informed "
        "by this. Emit before deriving rates.",
    ),
    "meta.corpus_span_days": _num(
        min_val=0.0,
        notes="Wall-clock duration in fractional days between the actor's earliest "
        "and latest message in the corpus snapshot. First-to-last only — blind "
        "to silence in between (a 47-day span with 5 active days still yields "
        "47). Complement with active_days and activity_density to get presence "
        "shape. Recomputable from first_seen_ts and last_seen_ts.",
    ),
    # NOTE(review): the worked example in the notes says 53 msgs in 0.3 days
    # is 53/day, but 53 / 0.3 is about 177 — confirm and fix together with
    # README.md.
    "meta.msg_per_day": _num(
        min_val=0.0,
        notes="total_messages / corpus_span_days. The key rate that separates a "
        "bursty single-session visitor (53 msgs in 0.3 days → 53/day) from a "
        "long-tail lurker (53 msgs in 47 days → 1.1/day). Undefined when "
        "corpus_span_days = 0; extractors should emit null/omit rather than "
        "divide-by-zero in that edge case.",
    ),
    # NOTE(review): active_days counts whole calendar days while
    # corpus_span_days is fractional, so a sub-day actor (span 0.3,
    # active_days 1) breaks "Always ≤ corpus_span_days" and makes
    # activity_density exceed its max_val of 1.0 — confirm whether extractors
    # are expected to clamp or the bound should be lifted.
    "meta.active_days": _num(
        min_val=0.0,
        notes="Count of distinct calendar days (UTC) on which the actor sent at "
        "least one message. Always ≤ corpus_span_days. An actor with span=47 "
        "and active_days=3 is a periodic visitor who appears rarely; one with "
        "span=47 and active_days=40 is a near-daily regular. Use alongside "
        "activity_density for full presence shape.",
    ),
    "meta.activity_density": _num(
        min_val=0.0,
        max_val=1.0,
        notes="active_days / corpus_span_days. Single scalar capturing 'how filled "
        "is the span?'. 1.0 = present every day of the window. Near-0 = "
        "appeared once or twice across a long window. Undefined when "
        "corpus_span_days = 0; emit null/omit for single-day actors.",
    ),
    "meta.first_seen_ts": _str(
        notes="ISO 8601 timestamp (with UTC offset, e.g. '2025-11-03T14:22:07+00:00') "
        "of the actor's earliest message in the corpus snapshot. Combined with "
        "last_seen_ts, this anchors corpus_span_days in absolute time so "
        "observations from different extractions can be compared temporally.",
    ),
    "meta.last_seen_ts": _str(
        notes="ISO 8601 timestamp (with UTC offset, e.g. '2025-12-20T09:11:43+00:00') "
        "of the actor's latest message in the corpus snapshot. See first_seen_ts.",
    ),
    "meta.fingerprint_confidence": _cat(
        "low",
        "medium",
        "high",
        notes="Qualitative reliability rating for this actor's full fingerprint. "
        "An attribution engine should weight all other observations from this "
        "actor proportionally to this value before compositing. Derivation is "
        "EXTRACTOR-DEFINED — the registry specifies the semantic contract, not "
        "the formula. Extractors must declare their heuristic in the source "
        "label (e.g. '#confidence-v1'). Typical inputs: total_messages, "
        "corpus_span_days, active_days, and any domain-specific thresholds "
        "the extractor authors have calibrated.",
    ),

    # ── stylometric.* (motor analog — 13) ─────────────────────────────────
    "stylometric.punctuation_style": _hash(
        notes="canonical punctuation-pattern fingerprint",
    ),
    "stylometric.capitalization_habit": _cat(
        "lowercase",
        "proper",
        "random_caps",
        "mixed_i",
        notes="Dominant capitalization rule the author applies. lowercase=no capitals except "
        "after sentence breaks. proper=standard title/sentence case. random_caps=no "
        "consistent rule. mixed_i=author consistently writes 'i' in lowercase even "
        "mid-sentence — common in Spanish chat where 'I' is not a standalone word "
        "but the habit transfers from the native language's lowercase 'yo'.",
    ),
    "stylometric.emoji_usage": _cat(
        "none",
        "occasional",
        "frequent",
        "exclusive",
        notes="Rate of emoji use per message. exclusive=messages rarely contain text without "
        "emoji. This captures tone and register — heavy emoji use in a criminal-market "
        "context is a distinct style trait worth preserving.",
    ),
    "stylometric.emoji_placement": _cat(
        "pre_punctuation",
        "post_punctuation",
        "no_punctuation",
        "mixed",
        notes="Where emojis appear relative to sentence-ending punctuation. "
        "pre_punctuation='Hola 😊.' post_punctuation='Hola. 😊' "
        "Individual authors are strikingly consistent in this micro-habit.",
    ),
    "stylometric.message_length_class": _cat(
        "short",
        "medium",
        "long",
        "paragraph",
        notes="Median message length bucket: short=1-5 words, medium=6-20 words, "
        "long=21-50 words, paragraph=>50 words. See also "
        "stylometric.message_length_variance_class for the distribution shape.",
    ),
    "stylometric.message_length_variance_class": _cat(
        "tight",
        "varied",
        "bimodal",
        notes="Coefficient of variation of per-message word counts. Captures "
        "DISTRIBUTION SHAPE that message_length_class collapses by "
        "emitting only the median bucket. Two authors can share the same "
        "median length but have wildly different variance: `tight` (CV<0.5) "
        "= consistent (always 1-3 words), `varied` (0.5<=CV<1.5) = normal "
        "mix, `bimodal` (CV>=1.5) = long-tail (mostly short with occasional "
        "rants). Added in v0.2 after Rutify calibration found median-only "
        "bucketing discarded most of the per-author variance signal.",
    ),
    "stylometric.linebreak_style": _cat(
        "single_thought",
        "multi_line",
        "wall_of_text",
        notes="Whether the author sends one complete thought per message or breaks a single "
        "statement into multiple sequential short messages. multi_line=habitual "
        "message-burst style (sends 3-5 short messages in rapid succession instead "
        "of one composed message). wall_of_text=rarely uses line breaks, sends dense "
        "blocks. Captures a stylistic rhythm that is hard to consciously alter.",
    ),
    "stylometric.typo_signature": _hash(
        notes="sha256 of canonical persistent-typo set",
    ),
    "stylometric.function_word_distribution_top50": _hash(
        notes="64-bit simhash over the 50-most-common Spanish function-word frequency "
        "vector. Mosteller-Wallace gold standard for English long-form authorship; "
        "EMPIRICALLY DOMAIN-FLAWED for Spanish chat-domain — calibrated 2026-05-02 "
        "against the Rutify corpus showed within-author and cross-author Hamming "
        "distance distributions overlap (within median 8 bits, cross median 10 "
        "bits) so this primitive ALONE cannot discriminate authors in chat-style "
        "short-message corpora. Engines should weight it low until paired with "
        "the larger top-200 variant or composited with character n-gram and "
        "distinctive-vocabulary signatures (see siblings below). Kept in v0 for "
        "calibration grids and documentary purposes.",
    ),
    "stylometric.function_word_distribution_top200": _hash(
        notes="64-bit simhash over the 200-most-common Spanish function-word frequency "
        "vector. The wider list reaches into the long tail (rare-but-individual "
        "function words like `tampoco`, `aunque`, `mientras`) that carry more "
        "discriminating signal in short-message chat domains. NOT YET EMITTED by "
        "the v0 prototype extractor; populated when v0.2 calibration is done.",
    ),
    "stylometric.character_ngram_simhash": _hash(
        notes="64-bit simhash over a frequency vector of character n-grams (default "
        "n=3) from the author's lowercased text corpus. ORTHOGONAL to "
        "function-word distributions: captures punctuation tics, accent-"
        "stripping habits, typo patterns, and idiom-fragment fingerprints "
        "that survive paraphrase. Lowercases input so that capitalization "
        "habits — already captured by stylometric.capitalization_habit — "
        "do not double-count. Accents PRESERVED because accent-stripping is "
        "itself a stylistic tic worth catching. Source label declares n size "
        "(e.g. `#char3gram`, `#char4gram`).",
    ),
    "stylometric.distinctive_vocabulary_signature": _hash(
        notes="64-bit simhash over a TF-IDF-weighted top-K rare-word vector. "
        "COMPLEMENTARY to function-word distributions: where function_word_* "
        "captures common-word *style*, this captures the author's distinctive "
        "*lexicon* (the words this person uses that other authors in the same "
        "corpus do NOT). Strong against context-shift because rare words are "
        "where authorial choice lives. Requires the chat corpus for IDF "
        "computation, performed once per extraction. Source label declares the "
        "top-K size and corpus tag (e.g. `#tfidf-top50`).",
    ),
    "stylometric.pos_ngram_signature": _hash(
        notes="64-bit simhash over a POS n-gram (default bigram) frequency vector "
        "from the author's text corpus. Captures syntactic skeleton independent "
        "of vocabulary — an author can change every word they use and still "
        "retain the same POS-bigram fingerprint. ORTHOGONAL to character_ngram "
        "and function_word distributions: those capture surface form, this "
        "captures grammatical structure. Example signal: consistent ADJ-NOUN vs "
        "NOUN-ADJ ordering in Spanish, habitual ADV-VERB pre-position. "
        "TAGGER-DEPENDENT: source label MUST declare the tagger, language model, "
        "and n value (e.g. `#spacy-es_core_news_sm-bi` for spaCy Spanish "
        "small model, bigrams). Calibration note: chat-domain text is noisy — "
        "abbreviations, misspellings, and code-switching cause tagger errors "
        "that introduce fingerprint noise. Engines should weight low until "
        "calibrated against labelled chat corpora.",
    ),

    # ── lexical.* (cognitive analog — 11) ─────────────────────────────────
    "lexical.vocabulary_richness": _num(
        min_val=0.0,
        max_val=1.0,
        notes="Moving-Average Type-Token Ratio (MATTR) over a sliding window "
        "(default 50 tokens). Volume-independent: each window contributes "
        "its own unique/total ratio, the primitive's value is the mean. "
        "Avoids the standard TTR bias where larger corpora mechanically "
        "score lower. Source label declares the window size.",
    ),
    "lexical.slang_density": _num(
        min_val=0.0,
        max_val=1.0,
        notes="rate per message; locale-tuned slang corpus",
    ),
    "lexical.code_switching_rate": _num(
        min_val=0.0,
        max_val=1.0,
        notes="switches per N tokens; Solorio & Liu metric",
    ),
    "lexical.code_switching_matrix_language": _str(
        notes="BCP-47 of dominant language",
    ),
    "lexical.code_switching_embedded_languages": _array(
        ValueKind.FREE_STRING,
        notes="BCP-47 list of non-matrix languages observed",
    ),
    "lexical.sentence_complexity_class": _cat(
        "simple",
        "compound",
        "complex",
        notes="Dominant clause structure. simple=single-clause messages (no conjunctions "
        "or subordination). compound=two independent clauses joined by coordinating "
        "conjunctions (pero, y, o, ni). complex=dependent clauses and subordination "
        "(aunque, porque, cuando, que + verb). Reflects education level and "
        "cognitive investment in message composition.",
    ),
    "lexical.question_formation_style": _cat(
        "punctuation_only",
        "lexical",
        "formal",
        notes="How questions are formed. punctuation_only=question mark appended without "
        "interrogative words ('¿Cuánto?' or 'Mañana?') — very common in Spanish "
        "chat. lexical=explicit interrogatives (¿qué, cómo, cuándo, dónde). "
        "formal=inverted subject-verb order or formal register ('¿Podría usted...'). "
        "Captures register and education level.",
    ),
    "lexical.imperative_style": _cat(
        "informal_directive",
        "formal_directive",
        "polite",
        notes="How commands and requests are framed. informal_directive=tú/vos imperative "
        "('dame', 'hazlo', 'mándame'). formal_directive=usted imperative "
        "('hágame el favor', 'envíeme'). polite=conditional or modal softening "
        "('¿podría...?', 'me gustaría...'). Stable per-author trait in criminal "
        "market contexts where hierarchical and peer relationships are expressed "
        "through register choice.",
    ),
    "lexical.dialect_region": _str(
        notes="Dominant regional variety of the actor's matrix language, expressed as "
        "a BCP-47 language-region tag (e.g. `es-CL`, `es-AR`, `es-MX`, `es-ES`, "
        "`en-US`). Detected from lexical marker density against per-region "
        "vocabulary tables; detection method and marker set version declared in "
        "source label (e.g. `#dialect-markers-v1`). Emit the literal string "
        "`unknown` when the extractor falls below its confidence threshold — do "
        "not omit the observation, so downstream engines can distinguish "
        "'undetected' from 'not extracted'. Language-agnostic in concept; the "
        "marker vocabulary is language-specific. COMPLEMENTARY to "
        "lexical.code_switching_matrix_language, which captures the dominant "
        "language via switching analysis rather than direct regional-marker lookup.",
    ),
    "lexical.evaluative_morphology_density": _num(
        min_val=0.0,
        max_val=1.0,
        notes="Rate of evaluative morpheme tokens / total tokens. Evaluative morphology "
        "encompasses suffixes that add expressive/emotional loading to a stem: "
        "diminutives (`-ito`/`-ita`/`-cito`/`-cita` — affection, minimization, "
        "softening), augmentatives (`-ón`/`-ona`/`-ote`/`-ota` — intensification), "
        "pejoratives (`-ejo`/`-eja`/`-ucho`/`-ucha` — contempt), and intensives "
        "(`-azo`/`-aza` — force or admiration by context). Heavy diminutive use "
        "is characteristic of Mexican and Central American Spanish; River Plate "
        "speakers use them significantly less. The density is stable per-author "
        "and hard to consciously suppress — it is baked into language acquisition. "
        "Language-agnostic in concept; detection (suffix rules or morphological "
        "analyser) is language-specific. Source label declares the morpheme set "
        "and tool version (e.g. `#eval-morph-es-v1`).",
    ),
    "lexical.optional_grammar_signature": _hash(
        notes="64-bit simhash over a vector of the author's preference probabilities "
        "at optional-grammar choice points — positions where the language offers "
        "multiple grammatically correct options and individual authors make stable "
        "idiosyncratic choices. For Spanish: compound past vs simple past ratio "
        "(`he comido` vs `comí` — Spain strongly prefers compound for recent "
        "actions; Latin America almost universally uses simple past, making this "
        "a high-reliability Spain/LatAm discriminator), subjunctive usage rate "
        "(avoidance correlates with informal register or non-native acquisition), "
        "leísmo/laísmo/loísmo clitic patterns (`le vi` vs `lo vi` for masculine "
        "accusative — leísmo is characteristic of Castilian Spain), and relative "
        "pronoun choice (`que` vs `el cual/la cual` — register marker). Each "
        "choice point is a scalar [0,1] probability; the simhash is computed over "
        "the concatenated vector. EXTRACTOR-DEFINED: choice-point set declared in "
        "source label (e.g. `#optgrammar-es-v1`). Requires sufficient corpus "
        "volume for stable probability estimates — thin corpora produce noisy "
        "hashes; engines should gate on meta.fingerprint_confidence before use.",
    ),

    # ── temporal_evolution.* (lifecycle / change-over-time — 1) ───────────
    "temporal_evolution.lifecycle_phase": _cat(
        "arrival_burst",
        "stable_member",
        "fluctuating_member",
        "inflection_member",
        "declining_member",
        "unknown",
        notes="Auto-classified lifecycle stage derived from windowed within-"
        "corpus analysis. arrival_burst: tenure < 24hr with first-window "
        "volume dominating later windows and high inter-window drift "
        "(empirically validated 2026-05-03 against OxPayload's first 12 "
        "hours on Rutify). stable_member: low drift between consecutive "
        "windows across the whole tenure. fluctuating_member (added v0.3): "
        "tenure ≥ 24hr with median drift in [stable_max, inflection_min) "
        "and no single window crossing inflection_min — established noisy "
        "regulars who don't fit clean stable/inflection classes (e.g. "
        "labelled admin lamarabitch, formerly classified unknown). "
        "inflection_member: long-tenure actor whose drift spikes in at "
        "least one window-pair (a real behavioral shift mid-corpus). "
        "declining_member: monotonically decreasing per-window message "
        "counts. unknown: insufficient windowed data for classification. "
        "Window size adapts to tenure: <24hr → 2h windows, <7d → 12h, "
        "<30d → 1d, otherwise 7d.",
    ),

    # ── network.* (governance/role-shape signals — 2, added v0.3) ─────────
    "network.is_likely_bot": _cat(
        "likely_bot",
        "not_bot",
        "unknown",
        notes="Heuristic bot detector composited from existing primitives. "
        "Classifies as likely_bot when conversation_initiation_rate ≥ 0.95 "
        "AND attention_pattern = broadcast AND vocabulary_richness < 0.65. "
        "Empirically validated 2026-05-03 against the tdl-labeled Rutify "
        "bot SangMata_beta_bot (correctly caught) vs 11 high-volume humans "
        "in the same corpus (none false-positive). NOT a verdict — engines "
        "should treat as a candidate signal, especially since low-volume "
        "bots (e.g. QuotLyBot with 9 messages) sit below the fingerprint "
        "threshold and emit nothing here. Source label declares the "
        "heuristic version (e.g. #bot-heuristic-v1).",
    ),
    "network.governance_role_signal": _cat(
        "admin_pattern",
        "responder_pattern",
        "regular",
        "bot_pattern",
        "unknown",
        notes="Heuristic role-shape composited from interaction primitives + "
        "lifecycle_phase. admin_pattern: init_rate ≥ 0.80 AND attn = "
        "reciprocal AND non-bot AND not arrival_burst. responder_pattern: "
        "init_rate ≤ 0.45 AND attn = reciprocal. bot_pattern: matches "
        "network.is_likely_bot likely_bot. regular: everything else above "
        "the volume threshold. Empirically caught all 4 high-volume "
        "tdl-labeled Rutify admins, sebaImlI as responder, "
        "SangMata_beta_bot as bot, OxPayload/bopxcx as regular (their "
        "arrival_burst lifecycle overrides the admin-shaped init_rate). "
        "NOT a ground-truth admin label — kkaxlazer matches admin_pattern "
        "while not formally admin, but the 2026-05-03 reply-graph cohort "
        "analysis showed they're operationally embedded in the admin "
        "layer (4/4 cohort signal with the top admin), so the heuristic "
        "is doing the right thing.",
    ),

    # ── interaction.* (temporal analog — 6) ───────────────────────────────
    "interaction.response_latency_class": _cat(
        "immediate",
        "fast",
        "normal",
        "slow",
        "sporadic",
        notes="How quickly the actor responds to messages directed at them. "
        "immediate=<30s (suggests active monitoring or automated response). "
        "fast=30s-5min. normal=5-60min (typical async chat). slow=1-24hr. "
        "sporadic=no consistent response latency — appears and disappears.",
    ),
    "interaction.conversation_initiation_rate": _num(
        min_val=0.0,
        max_val=1.0,
        notes="thread-starting messages / total",
    ),
    "interaction.message_burst_rate": _cat(
        "single",
        "occasional",
        "habitual",
        notes="Whether the actor sends multiple messages in rapid sequence within a "
        "conversation turn. habitual=almost always bursts (sends 3+ messages "
        "before any reply). single=almost always one message per turn. Tied to "
        "stylometric.linebreak_style multi_line.",
    ),
    "interaction.active_hours_class": _str(
        notes="UTC active-hours window summary",
    ),
    "interaction.session_duration_class": _cat(
        "short",
        "medium",
        "long",
        "marathon",
        notes="REUSED enum from BEHAVE-SHELL temporal.session_duration",
    ),
    "interaction.attention_pattern": _cat(
        "broadcast",
        "focused",
        "reciprocal",
        notes="from reply-graph centrality",
    ),

    # ── content.* (operational analog — 6, EXPERIMENTAL) ──────────────────
    "content.role_signal": _cat(
        "admin",
        "seller",
        "buyer",
        "lurker",
        "newbie",
        notes="EXPERIMENTAL — locale-tuned role-vocabulary classifier; "
        "may be moved to a separate IOC/keyword-detection layer "
        "once tested against the Rutify corpus",
    ),
    "content.transactional_language": _num(
        min_val=0.0,
        max_val=1.0,
        notes="EXPERIMENTAL — rate of transactional terms; "
        "locale-specific, brittle to vocabulary drift",
    ),
    "content.opsec_awareness": _num(
        min_val=0.0,
        max_val=1.0,
        notes="EXPERIMENTAL — rate of security-conscious phrases; "
        "HIGH FALSE-POSITIVE RISK on casual conversation about "
        "deleting files / messages",
    ),
    "content.targeting_language": _array(
        ValueKind.FREE_STRING,
        notes="EXPERIMENTAL — IOC-shaped target patterns "
        "(bank names, government portals, RUT ranges, etc); "
        "consider moving to dedicated IOC layer",
    ),
    "content.boasting_pattern": _cat(
        "none",
        "occasional",
        "frequent",
        notes="EXPERIMENTAL — success-claim regex; corpus-dependent",
    ),
    "content.conflict_style": _cat(
        "aggressive",
        "defusing",
        "appellate",
        notes="EXPERIMENTAL — dispute-tone classifier; needs "
        "labelled training data",
    ),
}
|
|
|
|
|
|
def is_known(primitive: str) -> bool:
    """Return True when *primitive* is a key of `PRIMITIVE_REGISTRY`."""
    registered = primitive in PRIMITIVE_REGISTRY
    return registered
|
|
|
|
|
|
def get(primitive: str) -> ValueTypeSpec:
    """Look up the value-type spec registered under *primitive*.

    Raises:
        KeyError: if *primitive* is not in `PRIMITIVE_REGISTRY`.
    """
    spec = PRIMITIVE_REGISTRY[primitive]
    return spec
|