Files
BEHAVE/BEHAVE-TEXT/tests/test_primitives.py
anti b182e2fe3b feat(text): add meta.* corpus-footprint layer and 4 language-aware primitives (v0.1.3)
Adds 12 new primitives across two waves of spec work this session.

meta.* layer (8 primitives) — corpus-snapshot footprint:
  total_messages, corpus_span_days, msg_per_day, active_days,
  activity_density, first_seen_ts, last_seen_ts, fingerprint_confidence.
  Motivated by two actors with identical message counts (53 each) producing
  indistinguishable profiles despite radically different presence shapes
  (0.3-day burst vs 47-day long tail).

Language-aware characterization primitives (4 primitives):
  stylometric.pos_ngram_signature — SimHash over POS bigram frequency vector;
    syntactic skeleton fingerprint that survives full vocabulary paraphrase.
  lexical.dialect_region — BCP-47 free_string (es-CL, es-AR, es-MX, …);
    designed for EYENET integration with INGEOTEC regional-spanish-models.
  lexical.evaluative_morphology_density — diminutive/augmentative/pejorative
    suffix density; stable per-author trait baked into language acquisition.
  lexical.optional_grammar_signature — SimHash over optional-grammar choice
    points (compound/simple past, subjunctive, leísmo, relative pronoun);
    high-reliability Spain vs LatAm discriminator.

Also fixes stale scratchpad.md references throughout (README.md is now the
authority), bumps behave-text to 0.1.3, and updates CHANGELOG.
2026-05-23 01:54:12 -04:00

115 lines
4.4 KiB
Python

# SPDX-License-Identifier: GPL-3.0-or-later
"""Registry coverage tests for BEHAVE-TEXT.
Asserts that every primitive listed in README.md's tables has exactly one
entry in PRIMITIVE_REGISTRY. Drift-detector — failing this test means
README.md and the registry have diverged.
"""
from __future__ import annotations
import re
from pathlib import Path
from behave_text.spec import PRIMITIVE_REGISTRY, ValueKind
# Primitive paths the registry is expected to contain, hand-extracted from
# README.md (v0). One tuple per layer; the union of all tuples is the
# expected key set of PRIMITIVE_REGISTRY.
EXPECTED_PRIMITIVES = set().union(
    # meta.* (corpus-snapshot footprint — 8)
    (
        "meta.total_messages",
        "meta.corpus_span_days",
        "meta.msg_per_day",
        "meta.active_days",
        "meta.activity_density",
        "meta.first_seen_ts",
        "meta.last_seen_ts",
        "meta.fingerprint_confidence",
    ),
    # stylometric.* (motor analog — 13)
    (
        "stylometric.punctuation_style",
        "stylometric.capitalization_habit",
        "stylometric.emoji_usage",
        "stylometric.emoji_placement",
        "stylometric.message_length_class",
        "stylometric.message_length_variance_class",
        "stylometric.linebreak_style",
        "stylometric.typo_signature",
        "stylometric.function_word_distribution_top50",
        "stylometric.function_word_distribution_top200",
        "stylometric.character_ngram_simhash",
        "stylometric.distinctive_vocabulary_signature",
        "stylometric.pos_ngram_signature",
    ),
    # lexical.* (cognitive analog — 11)
    (
        "lexical.vocabulary_richness",
        "lexical.slang_density",
        "lexical.code_switching_rate",
        "lexical.code_switching_matrix_language",
        "lexical.code_switching_embedded_languages",
        "lexical.sentence_complexity_class",
        "lexical.question_formation_style",
        "lexical.imperative_style",
        "lexical.dialect_region",
        "lexical.evaluative_morphology_density",
        "lexical.optional_grammar_signature",
    ),
    # temporal_evolution.* (lifecycle/change-over-time — 1, added v0.2)
    (
        "temporal_evolution.lifecycle_phase",
    ),
    # network.* (governance/role-shape — 2, added v0.3)
    (
        "network.is_likely_bot",
        "network.governance_role_signal",
    ),
    # interaction.* (temporal analog — 6)
    (
        "interaction.response_latency_class",
        "interaction.conversation_initiation_rate",
        "interaction.message_burst_rate",
        "interaction.active_hours_class",
        "interaction.session_duration_class",
        "interaction.attention_pattern",
    ),
    # content.* (operational analog — 6, EXPERIMENTAL)
    (
        "content.role_signal",
        "content.transactional_language",
        "content.opsec_awareness",
        "content.targeting_language",
        "content.boasting_pattern",
        "content.conflict_style",
    ),
)
def test_registry_covers_expected_primitives_exactly():
    """The registry's key set must equal EXPECTED_PRIMITIVES.

    Paths missing from the registry are reported first; paths present in the
    registry but not expected are reported second.
    """
    registered = set(PRIMITIVE_REGISTRY)

    absent = EXPECTED_PRIMITIVES.difference(registered)
    assert not absent, f"registry missing: {sorted(absent)}"

    unexpected = registered.difference(EXPECTED_PRIMITIVES)
    assert not unexpected, f"registry has unexpected entries: {sorted(unexpected)}"
def test_every_primitive_has_a_valid_spec():
    """Check the kind-specific invariants of every registry entry.

    * ``ValueKind.CATEGORICAL`` specs must define a non-empty ``allowed``
      whose members are all ``str``.
    * ``ValueKind.ARRAY`` specs must define ``array_of``, and it must not
      itself be ``ValueKind.ARRAY``.

    Specs of any other kind are not inspected by this test.
    """
    for primitive, spec in PRIMITIVE_REGISTRY.items():
        if spec.kind is ValueKind.CATEGORICAL:
            assert spec.allowed, f"{primitive}: categorical must define `allowed`"
            # Collect the offenders so a failure names both the primitive and
            # the bad values instead of a bare `assert all(...)`.
            non_str = [v for v in spec.allowed if not isinstance(v, str)]
            assert not non_str, (
                f"{primitive}: `allowed` values must be str, got {non_str!r}"
            )
        elif spec.kind is ValueKind.ARRAY:
            assert spec.array_of is not None, f"{primitive}: array must define `array_of`"
            assert spec.array_of is not ValueKind.ARRAY, (
                f"{primitive}: nested arrays not supported in v0"
            )
def test_primitive_paths_are_dotted_lowercase():
    """Every registry key must be a dotted, lowercase path.

    A valid path has at least two dot-separated segments; each segment starts
    with a lowercase ASCII letter and continues with lowercase letters,
    digits or underscores.
    """
    pattern = re.compile(r"^[a-z][a-z0-9_]*(\.[a-z][a-z0-9_]*)+$")
    for primitive in PRIMITIVE_REGISTRY:
        # fullmatch, not match: `$` also matches just before a trailing
        # newline, so match() would wrongly accept a key like "meta.foo\n".
        assert pattern.fullmatch(primitive), f"malformed primitive path: {primitive!r}"
def test_experimental_primitives_are_in_content_layer_only():
    """Only content.* primitives may mention "EXPERIMENTAL" in their notes (v0).

    Entries whose ``notes`` is empty/None or lacks the marker are skipped.
    """
    for path, spec in PRIMITIVE_REGISTRY.items():
        flagged = bool(spec.notes) and "EXPERIMENTAL" in spec.notes
        if not flagged:
            continue
        in_content_layer = path.startswith("content.")
        assert in_content_layer, (
            f"{path}: EXPERIMENTAL flag should only appear in content.* layer in v0"
        )
def test_topic_namespace_uses_actor_not_attacker():
    """Text-domain event topics live under `actor.*`, never `attacker.*`.

    Checks both the prefix constant and the full topic built for one
    sample primitive path.
    """
    from behave_text.spec import TOPIC_PREFIX, event_topic_for

    expected_prefix = "actor.observation.text"
    assert TOPIC_PREFIX == expected_prefix

    sample_primitive = "stylometric.emoji_usage"
    expected_topic = "actor.observation.text.stylometric.emoji_usage"
    assert event_topic_for(sample_primitive) == expected_topic