# Copyright 2022 - 2025 The PyMC Labs Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Causal module."""
from __future__ import annotations
import itertools as it
import re
import warnings
from collections.abc import Sequence
from typing import Annotated, Literal, NotRequired, TypedDict
try:
import networkx as nx
except ImportError: # Optional dependency
nx = None # type: ignore[assignment]
import numpy as np
import pandas as pd
import pymc as pm
import pytensor
import pytensor.tensor as pt
from pydantic import Field, InstanceOf, validate_call
from pymc_extras.prior import Prior
try:
    from dowhy import CausalModel
except ImportError:  # dowhy is an optional dependency

    class LazyCausalModel:
        """Stand-in for dowhy's ``CausalModel`` that fails on instantiation."""

        def __init__(self, *args, **kwargs):
            # Defer the failure to first use so importing this module still
            # works when the optional dependency is not installed.
            raise ImportError(
                "To use Causal Graph functionality, please install the optional dependencies with: "
                "pip install pymc-marketing[dag]"
            )

    CausalModel = LazyCausalModel
[docs]
class TestResult(TypedDict):
"""Conditional independence test statistics recorded during fitting."""
bic0: float
bic1: float
delta_bic: float
logBF10: float
BF10: float
independent: bool
conditioning_set: list[str]
forced: NotRequired[bool]
EMPTY_CONDITION_SET: frozenset[str] = frozenset()
[docs]
class BuildModelFromDAG:
"""Build a PyMC probabilistic model directly from a Causal DAG and a tabular dataset.
The class interprets a Directed Acyclic Graph (DAG) where each node is a column
in the provided `df`. For every edge ``A -> B`` it creates a slope prior for
the contribution of ``A`` into the mean of ``B``. Each node receives a
likelihood prior. Dims and coords are used to align and index observed data
via ``pm.Data`` and xarray.
Parameters
----------
dag : str
DAG in DOT format (e.g. ``digraph { A -> B; B -> C; }``) or as a simple
comma/newline separated list of edges (e.g. ``"A->B, B->C"``).
df : pandas.DataFrame
DataFrame that contains a column for every node present in the DAG and
all columns named by the provided ``dims``.
target : str
Name of the target node present in both the DAG and ``df``. This is not
used to restrict modeling but is validated to exist in the DAG.
dims : tuple[str, ...]
Dims for the observed variables and likelihoods (e.g. ``("date", "channel")``).
coords : dict
Mapping from dim names to coordinate values. All coord keys must exist as
columns in ``df`` and will be used to pivot the data to match dims.
model_config : dict, optional
Optional configuration with priors for keys ``"intercept"``, ``"slope"`` and
``"likelihood"``. Values should be ``pymc_extras.prior.Prior`` instances.
Missing keys fall back to :py:attr:`default_model_config`.
Examples
--------
Minimal example using DOT format:
.. code-block:: python
import numpy as np
import pandas as pd
from pymc_marketing.mmm.causal import BuildModelFromDAG
dates = pd.date_range("2024-01-01", periods=5, freq="D")
df = pd.DataFrame(
{
"date": dates,
"X": np.random.normal(size=5),
"Y": np.random.normal(size=5),
}
)
dag = "digraph { X -> Y; }"
dims = ("date",)
coords = {"date": dates}
builder = BuildModelFromDAG(
dag=dag, df=df, target="Y", dims=dims, coords=coords
)
model = builder.build()
Edge-list format and custom likelihood prior:
.. code-block:: python
from pymc_extras.prior import Prior
dag = "X->Y" # equivalent to the DOT example above
model_config = {
"likelihood": Prior(
"StudentT", nu=5, sigma=Prior("HalfNormal", sigma=1), dims=("date",)
),
}
builder = BuildModelFromDAG(
dag=dag,
df=df,
target="Y",
dims=("date",),
coords={"date": dates},
model_config=model_config,
)
model = builder.build()
"""
[docs]
@validate_call
def __init__(
    self,
    *,
    dag: str = Field(..., description="DAG in DOT string format or A->B list"),
    df: InstanceOf[pd.DataFrame] = Field(
        ..., description="DataFrame containing all DAG node columns"
    ),
    target: str = Field(..., description="Target node name present in DAG and df"),
    dims: tuple[str, ...] = Field(
        ..., description="Dims for observed/likelihood variables"
    ),
    coords: dict = Field(
        ...,
        description=(
            "Required coords mapping for dims and priors. All coord keys must exist as columns in df."
        ),
    ),
    model_config: dict | None = Field(
        None,
        description=(
            "Optional model config with Priors for 'intercept', 'slope' and "
            "'likelihood'. Keys not supplied fall back to defaults."
        ),
    ),
) -> None:
    """Store the inputs, parse the DAG and validate the configuration.

    Raises
    ------
    ValueError
        If ``target`` is not a node of the parsed DAG, or if one of the
        validation helpers rejects the configuration.
    """
    self.dag = dag
    self.df = df
    self.target = target
    self.dims = dims
    self.coords = coords

    # The graph is parsed first so the target can be checked against it.
    self.graph = self._parse_dag(self.dag)
    self.nodes = list(nx.topological_sort(self.graph))
    if self.target not in self.nodes:
        raise ValueError(f"Target '{self.target}' not in DAG nodes: {self.nodes}")

    # User-supplied entries override the defaults key by key.
    self.model_config = {**self.default_model_config, **(model_config or {})}

    # Checks run in a fixed order; none of them requires building the model.
    for check in (
        self._validate_model_config_priors,
        self._validate_coords_required_are_consistent,
        self._warning_if_slope_dims_dont_match_likelihood_dims,
        self._validate_intercept_dims_match_slope_dims,
    ):
        check()
@property
def default_model_config(self) -> dict[str, Prior]:
    """Build the default priors for intercepts, slopes and the likelihood.

    Returns
    -------
    dict
        A new dictionary with keys ``"intercept"``, ``"slope"`` and
        ``"likelihood"``. Intercept and slope use :py:attr:`dims` without
        ``"date"``; the likelihood uses :py:attr:`dims` unchanged.
    """
    non_date_dims = tuple(d for d in (self.dims or ()) if d != "date")
    config: dict[str, Prior] = {}
    config["intercept"] = Prior("Normal", mu=0, sigma=2, dims=non_date_dims)
    config["slope"] = Prior("Normal", mu=0, sigma=2, dims=non_date_dims)
    config["likelihood"] = Prior(
        "Normal",
        sigma=Prior("HalfNormal", sigma=2),
        dims=self.dims,
    )
    return config
@staticmethod
def _parse_dag(dag_str: str) -> nx.DiGraph:
    """Parse a DOT digraph or an edge-list string into a directed acyclic graph.

    Parameters
    ----------
    dag_str : str
        Either a DOT string starting with ``digraph`` or a comma/newline
        separated list of ``A->B`` tokens.

    Returns
    -------
    networkx.DiGraph
        Graph holding every parsed edge and standalone node.

    Raises
    ------
    ImportError
        If networkx is not installed.
    ValueError
        If the DOT braces are malformed, an edge-list token is invalid, or
        the resulting graph contains a cycle.
    """
    if nx is None:
        raise ImportError(
            "To use Causal Graph functionality, please install the optional dependencies with: "
            "pip install pymc-marketing[dag]"
        )

    s = dag_str.strip()
    g = nx.DiGraph()
    if s.lower().startswith("digraph"):
        # Work on the text between the first "{" and the last "}".
        brace_start = s.find("{")
        brace_end = s.rfind("}")
        if brace_start == -1 or brace_end == -1 or brace_end <= brace_start:
            raise ValueError("Malformed DOT digraph: missing braces")
        body = s[brace_start + 1 : brace_end]

        # Drop trailing comments (// ... or # ...) and empty lines.
        lines = []
        for raw_line in body.splitlines():
            line = re.split(r"//|#", raw_line, maxsplit=1)[0].strip()
            if line:
                lines.append(line)
        body = "\n".join(lines)

        # The head node is matched with a lookahead so it is not consumed:
        # in a chain "A -> B -> C" the node B can then also start the next
        # match, and both A->B and B->C are recorded.
        for m in re.finditer(
            r"\b([A-Za-z0-9_]+)\s*->\s*(?=([A-Za-z0-9_]+))", body
        ):
            g.add_edge(m.group(1), m.group(2))

        # Standalone node declarations: a single identifier, optional ';'.
        for raw_line in lines:
            line = raw_line.strip().rstrip(";")
            if not line or "->" in line or "[" in line or "]" in line:
                continue
            mnode = re.match(r"^([A-Za-z0-9_]+)$", line)
            if mnode:
                g.add_node(mnode.group(1))
    else:
        # Fallback: simple comma/newline-separated "A->B" tokens.
        edges: list[tuple[str, str]] = []
        for token in re.split(r"[,\n]+", s):
            token = token.strip().rstrip(";")
            if not token:
                continue
            medge = re.match(r"^([A-Za-z0-9_]+)\s*->\s*([A-Za-z0-9_]+)$", token)
            if not medge:
                raise ValueError(f"Invalid edge token: '{token}'")
            edges.append((medge.group(1), medge.group(2)))
        g.add_edges_from(edges)

    if not nx.is_directed_acyclic_graph(g):
        raise ValueError("Provided graph is not a DAG.")
    return g
def _warning_if_slope_dims_dont_match_likelihood_dims(self) -> None:
"""Warn if slope prior dims differ from likelihood dims without the 'date' dim."""
slope_prior = self.model_config["slope"]
likelihood_prior = self.model_config["likelihood"]
like_dims = getattr(likelihood_prior, "dims", None)
if isinstance(like_dims, str):
like_dims = (like_dims,)
elif isinstance(like_dims, list):
like_dims = tuple(like_dims)
# Guard against None dims (treat as empty)
if like_dims is None:
expected_slope_dims = ()
else:
expected_slope_dims = tuple(dim for dim in like_dims if dim != "date")
slope_dims = getattr(slope_prior, "dims", None)
if slope_dims is None or not isinstance(slope_dims, tuple):
slope_dims = ()
elif isinstance(slope_dims, str):
slope_dims = (slope_dims,)
elif isinstance(slope_dims, list):
slope_dims = tuple(slope_dims)
if slope_dims != expected_slope_dims:
warnings.warn(
(
"Slope prior dims "
f"{slope_dims if slope_dims else '()'} do not match expected dims "
f"{expected_slope_dims} (likelihood dims without 'date')."
),
stacklevel=2,
)
def _validate_intercept_dims_match_slope_dims(self) -> None:
"""Ensure intercept prior dims match slope prior dims exactly."""
def _to_tuple(maybe_dims):
if maybe_dims is None:
return tuple()
if isinstance(maybe_dims, str):
return (maybe_dims,)
if isinstance(maybe_dims, list | tuple):
return tuple(maybe_dims)
return tuple()
slope_dims = _to_tuple(getattr(self.model_config["slope"], "dims", None))
intercept_dims = _to_tuple(
getattr(self.model_config["intercept"], "dims", None)
)
if slope_dims != intercept_dims:
raise ValueError(
"model_config['intercept'].dims must match model_config['slope'].dims. "
f"Got intercept dims {intercept_dims or '()'} and slope dims {slope_dims or '()'}."
)
def _validate_model_config_priors(self) -> None:
"""Ensure required model_config entries are Prior instances.
Enforces that keys 'slope' and 'likelihood' exist and are Prior objects,
so downstream code can safely index and call Prior helper methods.
"""
required_keys = ("intercept", "slope", "likelihood")
for key in required_keys:
if key not in self.model_config:
raise ValueError(f"model_config must include '{key}' as a Prior.")
for key in required_keys:
if not isinstance(self.model_config[key], Prior):
raise TypeError(
f"model_config['{key}'] must be a Prior, got "
f"{type(self.model_config[key]).__name__}."
)
def _validate_coords_required_are_consistent(self) -> None:
"""Validate mutual consistency among dims, coords, priors, and data columns."""
if self.coords is None:
raise ValueError("'coords' is required and cannot be None.")
# 1) All coords keys must correspond to columns in the dataset
for key in self.coords.keys():
if key not in self.df.columns:
raise KeyError(
f"Coordinate key '{key}' not found in DataFrame columns. Present columns: {list(self.df.columns)}"
)
# 2) Ensure dims are present in coords
for d in self.dims:
if d not in self.coords:
raise ValueError(f"Missing coordinate values for dim '{d}' in coords.")
# 3) Ensure Prior.dims exist in coords (for all top-level priors we manage)
def _to_tuple(maybe_dims):
if isinstance(maybe_dims, str):
return (maybe_dims,)
if isinstance(maybe_dims, list | tuple):
return tuple(maybe_dims)
else:
return tuple()
for prior_name, prior in self.model_config.items():
if not isinstance(prior, Prior):
continue
for d in _to_tuple(getattr(prior, "dims", None)):
if d not in self.coords:
raise ValueError(
f"Dim '{d}' declared in Prior '{prior_name}' must be present in coords."
)
# 4) Enforce that likelihood dims match class dims exactly
likelihood_prior = self.model_config["likelihood"]
likelihood_dims = _to_tuple(getattr(likelihood_prior, "dims", None))
if likelihood_dims and tuple(self.dims) != likelihood_dims:
raise ValueError(
"Likelihood Prior dims "
f"{likelihood_dims} must match class dims {tuple(self.dims)}. "
"When supplying a custom model_config, ensure likelihood.dims equals the 'dims' argument."
)
def _parents(self, node: str) -> list[str]:
"""Return the list of parent node names for the given DAG node."""
return list(self.graph.predecessors(node))
[docs]
def build(self) -> pm.Model:
    """Construct and return the PyMC model implied by the DAG and data.

    A ``pm.Data`` container named ``_<node>`` is created for every node,
    holding that node's column pivoted to the declared ``dims``. For each
    edge ``A -> B`` a slope is created from ``model_config['slope']`` and
    multiplied by the data of ``A``. Those terms plus an intercept from
    ``model_config['intercept']`` form the mean of the likelihood of ``B``,
    created from ``model_config['likelihood']``. The model is also stored
    on ``self.model``.

    Returns
    -------
    pymc.Model
        Model with slopes, intercepts and likelihoods for all nodes.

    Raises
    ------
    KeyError
        If a DAG node is not a column of the DataFrame.

    Examples
    --------
    Build a model and sample from it:

    .. code-block:: python

        builder = BuildModelFromDAG(
            dag="A->B", df=df, target="B", dims=("date",), coords={"date": dates}
        )
        model = builder.build()
        with model:
            idata = pm.sample(100, tune=100, chains=2, cores=2)

    Multi-dimensional dims (e.g. date and country):

    .. code-block:: python

        dims = ("date", "country")
        coords = {"date": dates, "country": ["Venezuela", "Colombia"]}
        builder = BuildModelFromDAG(
            dag="A->B, B->Y", df=df, target="Y", dims=dims, coords=coords
        )
        model = builder.build()
    """
    dims = self.dims

    # Fail before touching PyMC when a node has no data column.
    for node in self.nodes:
        if node not in self.df.columns:
            raise KeyError(f"Column '{node}' not found in df.")

    # The pivot does not depend on the node, so it is done once for the
    # whole frame instead of once per node.
    pivoted = self.df.set_index(list(dims)).to_xarray()

    with pm.Model(coords=self.coords) as model:
        data_containers: dict[str, pm.Data] = {
            node: pm.Data(f"_{node}", pivoted[node].values, dims=dims)
            for node in self.nodes
        }

        # self.nodes is topologically sorted, giving a stable creation order.
        for node in self.nodes:
            mu_expr = 0
            for parent in self._parents(node):
                slope_rv = self.model_config["slope"].create_variable(
                    f"{parent.lower()}:{node.lower()}"
                )
                mu_expr += slope_rv * data_containers[parent]

            intercept_rv = self.model_config["intercept"].create_variable(
                f"{node.lower()}_intercept"
            )
            self.model_config["likelihood"].create_likelihood_variable(
                name=node,
                mu=mu_expr + intercept_rv,
                observed=data_containers[node],
            )

    self.model = model
    return self.model
[docs]
def model_graph(self):
"""Return a Graphviz visualization of the built PyMC model.
Returns
-------
graphviz.Source
Graphviz object representing the model graph.
Examples
--------
.. code-block:: python
model = builder.build()
g = builder.model_graph()
g
"""
if not hasattr(self, "model"):
raise RuntimeError("Call build() first.")
return pm.model_to_graphviz(self.model)
[docs]
def dag_graph(self):
    """Return a new NetworkX digraph with the nodes and edges of the parsed DAG.

    Returns
    -------
    networkx.DiGraph
        Fresh graph populated from ``self.graph.nodes`` and
        ``self.graph.edges``.

    Raises
    ------
    ImportError
        If networkx is not installed.

    Examples
    --------
    .. code-block:: python

        g = builder.dag_graph()
        list(g.edges())
    """
    if nx is None:
        raise ImportError(
            "To use Causal Graph functionality, please install the optional dependencies with: "
            "pip install pymc-marketing[dag]"
        )
    source = self.graph
    duplicate = nx.DiGraph()
    duplicate.add_nodes_from(source.nodes)
    duplicate.add_edges_from(source.edges)
    return duplicate
[docs]
class TBFPC:
r"""
Target-first Bayes Factor PC (TBF-PC) causal discovery algorithm.
This algorithm is a target-oriented variant of the Peter–Clark (PC) algorithm,
using Bayes factors (via ΔBIC approximation) as the conditional independence test.
For each conditional independence test of the form
.. math::
H_0 : Y \perp X \mid S
\quad \text{vs.} \quad
H_1 : Y \not\!\perp X \mid S
we compare two linear models:
.. math::
M_0 : Y \sim S
\\
M_1 : Y \sim S + X
where :math:`S` is a conditioning set of variables.
The Bayesian Information Criterion (BIC) is defined as
.. math::
\mathrm{BIC}(M) = n \log\!\left(\frac{\mathrm{RSS}}{n}\right)
+ k \log(n),
with residual sum of squares :math:`\mathrm{RSS}`, sample size :math:`n`,
and number of parameters :math:`k`.
The Bayes factor is approximated by
.. math::
\log \mathrm{BF}_{10} \approx -\tfrac{1}{2}
\left[ \mathrm{BIC}(M_1) - \mathrm{BIC}(M_0) \right].
Independence is declared if :math:`\mathrm{BF}_{10} < \tau`,
where :math:`\tau` is set via the ``bf_thresh`` parameter.
Target Edge Rules
-----------------
Different rules govern how driver → target edges are retained:
- ``"any"``:
keep :math:`X \to Y` unless **any** conditioning set renders
:math:`X \perp Y \mid S`.
- ``"conservative"``:
keep :math:`X \to Y` if **at least one** conditioning set shows
dependence.
- ``"fullS"``:
test only with the **full set** of other drivers as :math:`S`.
Examples
--------
**1. Basic usage with full conditioning set**
.. code-block:: python
import numpy as np, pandas as pd
rng = np.random.default_rng(7)
n = 2000
C = rng.gamma(2,1,n)
A = 0.7*C + rng.gamma(2,1,n)
D = 0.5*C + rng.gamma(2,1,n)
B = 0.8*A + rng.gamma(2,1,n)
Y = 0.9*B + 0.6*D + 0.7*C + rng.gamma(2,1,n)
df = pd.DataFrame({"A":A,"B":B,"C":C,"D":D,"Y":Y})
df = (df - df.mean())/df.std() # recommended scaling
model = TBFPC(target="Y", target_edge_rule="fullS")
model.fit(df, drivers=["A","B","C","D"])
print(model.get_directed_edges())
print(model.get_undirected_edges())
print(model.to_digraph())
**2. Using forbidden edges**
You can specify edges that must *not* be tested or included
(prior knowledge about the domain).
.. code-block:: python
model = TBFPC(
target="Y",
target_edge_rule="any",
forbidden_edges=[("A","C")] # forbid A--C
)
model.fit(df, drivers=["A","B","C","D"])
print(model.to_digraph())
**3. Conservative rule**
Keeps driver → target edges if **any conditioning set**
shows dependence.
.. code-block:: python
model = TBFPC(target="Y", target_edge_rule="conservative")
model.fit(df, drivers=["A","B","C","D"])
print(model.to_digraph())
References
----------
- Spirtes, Glymour, Scheines (2000). *Causation, Prediction, and Search*. MIT Press. [PC algorithm]
- Spirtes & Glymour (1991). "An Algorithm for Fast Recovery of Sparse Causal Graphs."
- Kass, R. & Raftery, A. (1995). "Bayes Factors."
"""
[docs]
@validate_call(config=dict(arbitrary_types_allowed=True))
def __init__(
    self,
    target: Annotated[
        str,
        Field(
            min_length=1,
            description="Name of the outcome variable to orient the search.",
        ),
    ],
    *,
    target_edge_rule: Literal["any", "conservative", "fullS"] = "any",
    bf_thresh: Annotated[float, Field(gt=0.0)] = 1.0,
    forbidden_edges: Sequence[tuple[str, str]] | None = None,
    required_edges: Sequence[tuple[str, str]] | None = None,
):
    """Create a new TBFPC causal discovery model.

    Parameters
    ----------
    target
        Variable name for the model outcome; must be present in the data
        used during fitting.
    target_edge_rule
        Rule that controls which driver → target edges are retained.
        Options are ``"any"``, ``"conservative"``, and ``"fullS"``.
    bf_thresh
        Positive Bayes factor threshold applied during conditional
        independence tests.
    forbidden_edges
        Optional sequence of node pairs that must not be connected in the
        learned graph.
    required_edges
        Optional sequence of directed ``(u, v)`` pairs that must be present
        in the learned graph as ``u -> v``.

    Raises
    ------
    ValueError
        If a required edge is also forbidden, in either direction.
    """
    warnings.warn(
        "TBFPC is experimental and its API may change; use with caution.",
        UserWarning,
        stacklevel=2,
    )
    self.target = target
    self.target_edge_rule = target_edge_rule
    self.bf_thresh = float(bf_thresh)
    self.forbidden_edges: set[tuple[str, str]] = set(forbidden_edges or [])
    self.required_edges: set[tuple[str, str]] = set(required_edges or [])

    # Forbidden edges are undirected, so both orientations are checked.
    # Sorting keeps the error message independent of set iteration order.
    conflicts = sorted(
        (u, v)
        for (u, v) in self.required_edges
        if (u, v) in self.forbidden_edges or (v, u) in self.forbidden_edges
    )
    if conflicts:
        conflict_str = ", ".join(f"{u}->{v}" for u, v in conflicts)
        raise ValueError(
            f"Required edges conflict with forbidden edges: {conflict_str}"
        )

    # Internal state
    self.sep_sets: dict[tuple[str, str], set[str]] = {}
    self._adj_directed: set[tuple[str, str]] = set()
    self._adj_undirected: set[tuple[str, str]] = set()
    self.nodes_: list[str] = []
    self.test_results: dict[tuple[str, str, frozenset[str]], TestResult] = {}

    # Shared response vector for symbolic BIC computation.
    # Initialized with a placeholder; overwritten with data during fitting.
    self.y_sh = pytensor.shared(np.zeros(1, dtype="float64"), name="y_sh")
    self._bic_fn = self._build_symbolic_bic_fn()
@staticmethod
def _bitmasks(k: int):
"""Yield tuples of 0/1 of length k (fast product without importing itertools)."""
# Equivalent to itertools.product([0,1], repeat=k) but minimal
if k == 0:
yield ()
return
stack = [0] * k
i = 0
while True:
if i < k:
stack[i] = 0
i += 1
continue
yield tuple(stack)
# increment like binary counter
i -= 1
while i >= 0 and stack[i] == 1:
i -= 1
if i < 0:
break
stack[i] = 1
i += 1
@staticmethod
def _parse_cpdag_dot(
dot: str,
) -> tuple[set[str], set[tuple[str, str]], set[tuple[str, str]]]:
"""Minimal DOT parser for a single CPDAG block."""
import re
digraph = re.search(r"digraph\b[^{}]*\{(.*?)\}", dot, flags=re.DOTALL)
if not digraph:
raise ValueError("No 'digraph { ... }' block found.")
body = digraph.group(1)
nodes: set[str] = set()
directed: set[tuple[str, str]] = set()
undirected: set[tuple[str, str]] = set()
node_re = re.compile(r'^\s*"([^"]+)"\s*(?:\[[^\]]*\])?\s*;\s*$')
edge_re = re.compile(r'^\s*"([^"]+)"\s*->\s*"([^"]+)"\s*(\[[^\]]*\])?\s*;\s*$')
def is_undirected(attrs: str | None) -> bool:
if not attrs:
return False
low = attrs.lower()
return "style=dashed" in low and "dir=none" in low
for raw in body.splitlines():
line = raw.strip()
if not line or line.startswith("//"):
continue
# node?
m = node_re.match(line)
if m:
nodes.add(m.group(1))
continue
# edge?
m = edge_re.match(line)
if m:
u, v, attrs = m.group(1), m.group(2), m.group(3)
nodes.update((u, v))
if is_undirected(attrs):
undirected.add((u, v) if u <= v else (v, u))
else:
directed.add((u, v))
continue
# ignore other lines (global styles etc.)
return nodes, directed, undirected
@staticmethod
def _is_acyclic(
nodes: set[str], edges: list[tuple[str, str]] | set[tuple[str, str]]
) -> bool:
"""DFS cycle check."""
adj: dict[str, list[str]] = {u: [] for u in nodes}
for u, v in edges:
adj.setdefault(u, []).append(v)
adj.setdefault(v, []) # ensure key exists
state = {u: 0 for u in nodes} # 0=unseen, 1=visiting, 2=done
def dfs(u: str) -> bool:
state[u] = 1
for w in adj[u]:
if state[w] == 1:
return False
if state[w] == 0 and not dfs(w):
return False
state[u] = 2
return True
return all(state[u] or dfs(u) for u in nodes)
def _dot_from_edges(
self, nodes: set[str], edges: list[tuple[str, str]] | set[tuple[str, str]]
) -> str:
"""Render a fully directed graph to DOT; highlights target if present."""
lines = ["digraph G {", " node [shape=ellipse];"]
for n in sorted(nodes):
if hasattr(self, "target") and n == self.target:
lines.append(f' "{n}" [style=filled, fillcolor="#eef5ff"];')
else:
lines.append(f' "{n}";')
for u, v in sorted(edges):
lines.append(f' "{u}" -> "{v}";')
lines.append("}")
return "\n".join(lines).replace("\\n'", "\\n").replace("'\\n", "\\n") # hygiene
def _key(self, u: str, v: str) -> tuple[str, str]:
"""Return a sorted 2-tuple key for an undirected edge between ``u`` and ``v``."""
return (u, v) if u <= v else (v, u)
def _set_sep(self, u: str, v: str, S: Sequence[str]) -> None:
"""Record the separation set ``S`` for the node pair ``(u, v)``."""
self.sep_sets[self._key(u, v)] = set(S)
def _has_forbidden(self, u: str, v: str) -> bool:
"""Return True if edge ``u—v`` is forbidden in either direction."""
return (u, v) in self.forbidden_edges or (v, u) in self.forbidden_edges
def _is_required(self, u: str, v: str) -> bool:
"""Return True if the directed edge ``u -> v`` is required."""
return (u, v) in self.required_edges
def _add_directed(self, u: str, v: str) -> None:
"""Add a directed edge ``u -> v`` if not forbidden; drop undirected if present."""
if not self._has_forbidden(u, v):
self._adj_undirected.discard(self._key(u, v))
self._adj_directed.add((u, v))
def _add_undirected(self, u: str, v: str) -> None:
"""Add an undirected edge ``u -- v`` if allowed and not already directed."""
if (
not self._has_forbidden(u, v)
and (u, v) not in self._adj_directed
and (v, u) not in self._adj_directed
and not self._is_required(u, v)
and not self._is_required(v, u)
):
self._adj_undirected.add(self._key(u, v))
def _remove_all(self, u: str, v: str) -> None:
"""Remove any edge (directed or undirected) between ``u`` and ``v``."""
if self._is_required(u, v) or self._is_required(v, u):
return
self._adj_undirected.discard(self._key(u, v))
self._adj_directed.discard((u, v))
self._adj_directed.discard((v, u))
def _enforce_required_edges(self) -> None:
    """Insert every required edge as a directed adjacency.

    For each required ``u -> v`` the undirected edge and the reverse
    directed edge are removed, ``u -> v`` is added, and a placeholder test
    record flagged ``forced`` is stored under the empty conditioning set.
    """
    nan_fields = ("bic0", "bic1", "delta_bic", "logBF10", "BF10")
    for u, v in self.required_edges:
        self._adj_undirected.discard(self._key(u, v))
        self._adj_directed.discard((v, u))
        self._adj_directed.add((u, v))

        # No statistics exist for a forced edge, hence the NaN placeholders.
        record = {field: float("nan") for field in nan_fields}
        record["independent"] = False
        record["conditioning_set"] = []
        record["forced"] = True
        self.test_results[(u, v, EMPTY_CONDITION_SET)] = record
def _validate_required_nodes(self, drivers: Sequence[str]) -> None:
"""Ensure required edges reference known nodes."""
allowed = set(drivers) | {self.target}
missing: set[str] = set()
for u, v in self.required_edges:
if u not in allowed:
missing.add(u)
if v not in allowed:
missing.add(v)
if missing:
raise ValueError(
"Required edges reference unknown nodes: " + ", ".join(sorted(missing))
)
def _build_symbolic_bic_fn(self):
    """Compile the BIC computation and wrap it in a Python callable.

    Two PyTensor functions are compiled against the shared response vector
    ``self.y_sh``: one solves the normal equations, the other uses the
    pseudoinverse of the design matrix. The returned callable
    ``bic_fn(X_val, n_val)`` uses the first and falls back to the second
    when the solver raises or the residual sum of squares is not finite
    or not above the smallest positive float64.
    """
    tiny = np.finfo("float64").tiny
    design = pt.matrix("X")
    n_obs = pt.iscalar("n")
    n_float = pt.cast(n_obs, "float64")
    n_params = design.shape[1]

    def _rss(beta):
        # Residual sum of squares for the given coefficient vector.
        residual = self.y_sh - pt.dot(design, beta)
        return pt.sum(residual**2)

    def _bic(rss):
        # RSS is clamped away from zero so the logarithm stays finite.
        rss_safe = pt.maximum(rss, tiny)
        return n_float * pt.log(rss_safe / n_float) + n_params * pt.log(n_float)

    gram = pt.dot(design.T, design)
    moment = pt.dot(design.T, self.y_sh)
    rss_solve = _rss(pt.linalg.solve(gram, moment))
    rss_pinv = _rss(pt.nlinalg.pinv(design) @ self.y_sh)

    solve_fn = pytensor.function(
        [design, n_obs],
        [_bic(rss_solve), rss_solve],
        on_unused_input="ignore",
        mode="FAST_RUN",
    )
    pinv_fn = pytensor.function(
        [design, n_obs], _bic(rss_pinv), on_unused_input="ignore", mode="FAST_RUN"
    )

    def bic_fn(X_val: np.ndarray, n_val: int) -> float:
        try:
            bic_value, rss_value = solve_fn(X_val, n_val)
            if np.isfinite(rss_value) and rss_value > tiny:
                return float(bic_value)
        except (np.linalg.LinAlgError, RuntimeError, ValueError):
            pass
        return float(pinv_fn(X_val, n_val))

    return bic_fn
def _ci_independent(
self, df: pd.DataFrame, x: str, y: str, cond: Sequence[str]
) -> bool:
"""Return True if ΔBIC indicates independence of ``x`` and ``y`` given ``cond``."""
if self._has_forbidden(x, y):
return True
if self._is_required(x, y) or self._is_required(y, x):
self.test_results[(x, y, frozenset(cond))] = TestResult(
bic0=float("nan"),
bic1=float("nan"),
delta_bic=float("nan"),
logBF10=float("nan"),
BF10=float("nan"),
independent=False,
conditioning_set=list(cond),
forced=True,
)
return False
n = len(df)
self.y_sh.set_value(df[y].to_numpy().astype("float64"))
if len(cond) == 0:
X0 = np.ones((n, 1))
else:
X0 = np.column_stack([np.ones(n), df[list(cond)].to_numpy()])
X1 = np.column_stack([X0, df[x].to_numpy()])
bic0 = float(self._bic_fn(X0, n))
bic1 = float(self._bic_fn(X1, n))
delta_bic = bic1 - bic0
logBF10 = -0.5 * delta_bic
BF10 = np.exp(logBF10)
independence = BF10 < self.bf_thresh
result: TestResult = {
"bic0": bic0,
"bic1": bic1,
"delta_bic": delta_bic,
"logBF10": logBF10,
"BF10": BF10,
"independent": independence,
"conditioning_set": list(cond),
}
self.test_results[(x, y, frozenset(cond))] = result
return independence
def _test_target_edges(self, df: pd.DataFrame, drivers: Sequence[str]) -> None:
"""Phase 1: test driver→target edges according to ``target_edge_rule``."""
for xi in drivers:
neighbor_sets = [d for d in drivers if d != xi]
max_k = min(3, len(neighbor_sets))
all_sets = [
tuple(S)
for k in range(max_k + 1)
for S in it.combinations(neighbor_sets, k)
]
if self.target_edge_rule == "any":
keep = True
for S in all_sets:
if self._ci_independent(df, xi, self.target, S):
self._set_sep(xi, self.target, S)
keep = False
break
if keep:
self._add_directed(xi, self.target)
else:
self._remove_all(xi, self.target)
elif self.target_edge_rule == "conservative":
indep_all = True
for S in all_sets:
if not self._ci_independent(df, xi, self.target, S):
indep_all = False
else:
self._set_sep(xi, self.target, S)
if indep_all:
self._remove_all(xi, self.target)
else:
self._add_directed(xi, self.target)
elif self.target_edge_rule == "fullS":
S = tuple(neighbor_sets)
if self._ci_independent(df, xi, self.target, S):
self._set_sep(xi, self.target, S)
self._remove_all(xi, self.target)
else:
self._add_directed(xi, self.target)
def _test_driver_skeleton(self, df: pd.DataFrame, drivers: Sequence[str]) -> None:
"""Phase 2: build the undirected driver skeleton via pairwise CI tests."""
for xi, xj in it.combinations(drivers, 2):
others = [d for d in drivers if d not in (xi, xj)]
max_k = min(3, len(others))
dependent = True
sep_rec = False
for k in range(max_k + 1):
for S in it.combinations(others, k):
if self._ci_independent(df, xi, xj, S):
self._set_sep(xi, xj, S)
dependent = False
sep_rec = True
break
if sep_rec:
break
if dependent:
self._add_undirected(xi, xj)
else:
self._remove_all(xi, xj)
[docs]
def fit(self, df: pd.DataFrame, drivers: Sequence[str]):
"""Fit the TBFPC procedure to the supplied dataframe.
Parameters
----------
df : pandas.DataFrame
Dataset containing the target column and every candidate driver.
drivers : Sequence[str]
Iterable of column names to treat as potential drivers of the
target.
Returns
-------
TBFPC
The fitted instance (``self``) with internal adjacency structures
populated.
Examples
--------
.. code-block:: python
model = TBFPC(target="Y", target_edge_rule="fullS")
model.fit(df, drivers=["A", "B", "C"])
"""
self._validate_required_nodes(drivers)
self.sep_sets.clear()
self._adj_directed.clear()
self._adj_undirected.clear()
self.test_results.clear()
self._enforce_required_edges()
self._test_target_edges(df, drivers)
self._test_driver_skeleton(df, drivers)
self._enforce_required_edges()
self.nodes_ = [*list(drivers), self.target]
return self
[docs]
def get_directed_edges(self) -> list[tuple[str, str]]:
"""Return directed edges learned by the algorithm.
Returns
-------
list[tuple[str, str]]
Sorted list of ``(u, v)`` pairs representing oriented edges.
Examples
--------
.. code-block:: python
directed = model.get_directed_edges()
"""
return sorted(self._adj_directed)
[docs]
def get_undirected_edges(self) -> list[tuple[str, str]]:
"""Return undirected edges remaining after orientation.
Returns
-------
list[tuple[str, str]]
Sorted list of ``(u, v)`` pairs for unresolved adjacencies.
Examples
--------
.. code-block:: python
skeleton = model.get_undirected_edges()
"""
return sorted(self._adj_undirected)
[docs]
def get_test_results(self, x: str, y: str) -> list[TestResult]:
"""Return ΔBIC diagnostics for the unordered pair ``(x, y)``.
Parameters
----------
x : str
Name of the first variable in the pair.
y : str
Name of the second variable in the pair.
Returns
-------
list[dict[str, float]]
Each dictionary contains ``bic0``, ``bic1``, ``delta_bic``,
``logBF10``, ``BF10``, and the conditioning set used during the
test.
Examples
--------
.. code-block:: python
stats = model.get_test_results("A", "Y")
"""
return [v for (xi, yi, _), v in self.test_results.items() if {xi, yi} == {x, y}]
[docs]
def summary(self) -> str:
    """Build a plain-text report of the learned graph.

    Returns
    -------
    str
        Newline-joined text with three sections: the directed edges
        (required ones tagged ``[required]``), the undirected edges, and
        the number of entries in ``self.test_results``.

    Examples
    --------
    .. code-block:: python

        print(model.summary())
    """
    directed_rows = [
        f"{src} -> {dst}" + (" [required]" if self._is_required(src, dst) else "")
        for src, dst in self.get_directed_edges()
    ]
    undirected_rows = [f"{a} -- {b}" for a, b in self.get_undirected_edges()]
    report = [
        "=== Directed edges ===",
        *directed_rows,
        "=== Undirected edges ===",
        *undirected_rows,
        "=== Number of CI tests run ===",
        str(len(self.test_results)),
    ]
    return "\n".join(report)
[docs]
def to_digraph(self) -> str:
    """Serialize the learned graph as a DOT ``digraph``.

    Every entry of ``self.nodes_`` is declared; the target node is drawn
    filled. Directed edges are emitted as-is, with required edges
    highlighted in dark green, and undirected edges are emitted as dashed
    arrows with ``dir=none``.

    Returns
    -------
    str
        DOT string compatible with Graphviz rendering utilities.

    Examples
    --------
    .. code-block:: python

        dot_str = model.to_digraph()
    """
    dot = ["digraph G {", " node [shape=ellipse];"]

    # Node declarations: only the target gets the filled style.
    dot.extend(
        f' "{name}" [style=filled, fillcolor="#eef5ff"];'
        if name == self.target
        else f' "{name}";'
        for name in self.nodes_
    )

    # Oriented edges; required ones carry extra styling.
    for src, dst in self.get_directed_edges():
        if self._is_required(src, dst):
            attrs = " [color=darkgreen, penwidth=2]"
        else:
            attrs = ""
        dot.append(f' "{src}" -> "{dst}"{attrs};')

    # Unresolved adjacencies are rendered without arrowheads.
    dot.extend(
        f' "{a}" -> "{b}" [style=dashed, dir=none];'
        for a, b in self.get_undirected_edges()
    )

    dot.append("}")
    return "\n".join(dot)
[docs]
def get_all_cdags_from_cpdag(self, dot_cpdag: str | None = None) -> list[str]:
"""
Enumerate all acyclic orientations (consistent extensions) of the CPDAG.
Parameters
----------
dot_cpdag : str | None
If provided, parse the CPDAG from this DOT string (expects undirected
edges encoded as `[style=dashed, dir=none]`). If None, use the model's
current CPDAG from `self.get_directed_edges()` and `self.get_undirected_edges()`.
Returns
-------
list[str]
A list of DOT strings, each representing a fully oriented DAG (no dashed edges).
"""
nodes, fixed_dir, undirected = (
self._parse_cpdag_dot(dot_cpdag)
if dot_cpdag is not None
else (
set(self.nodes_),
set(self.get_directed_edges()),
set(self.get_undirected_edges()),
)
)
if not undirected:
# Already a DAG: validate acyclicity and return it
edges = sorted(fixed_dir)
if self._is_acyclic(nodes, edges):
return [self._dot_from_edges(nodes, edges)]
return []
cdags: list[str] = []
und = sorted({self._key(u, v) for (u, v) in undirected}) # canonical ordering
for mask in self._bitmasks(len(und)):
oriented = list(fixed_dir)
# bit 0 -> u->v, bit 1 -> v->u
oriented.extend(
(u, v) if b == 0 else (v, u)
for b, (u, v) in zip(mask, und, strict=False)
)
if self._is_acyclic(nodes, oriented):
cdags.append(self._dot_from_edges(nodes, oriented))
return cdags
[docs]
class CausalGraphModel:
    """Represent a causal model based on a Directed Acyclic Graph (DAG).

    Provides methods to analyze causal relationships and determine the
    adjustment set for backdoor adjustment between treatment and outcome
    variables.

    Parameters
    ----------
    causal_model : CausalModel
        An instance of dowhy's CausalModel, representing the causal graph and
        its relationships.
    treatment : list[str] | tuple[str, ...]
        Treatment variable names.
    outcome : str
        The outcome variable name.

    References
    ----------
    .. [1] https://github.com/microsoft/dowhy
    """

    def __init__(
        self,
        causal_model: CausalModel,
        treatment: list[str] | tuple[str, ...],
        outcome: str,
    ) -> None:
        self.causal_model = causal_model
        self.treatment = treatment
        self.outcome = outcome

    @classmethod
    def build_graphical_model(
        cls, graph: str, treatment: list[str] | tuple[str, ...], outcome: str
    ) -> CausalGraphModel:
        """Create a CausalGraphModel from a string representation of a graph.

        Parameters
        ----------
        graph : str
            A string representation of the graph (e.g., String in DOT format).
        treatment : list[str] | tuple[str, ...]
            Treatment variable names.
        outcome : str
            The outcome variable name.

        Returns
        -------
        CausalGraphModel
            An instance of CausalGraphModel constructed from the given graph
            string.
        """
        # Only the graph structure is used here, so an empty frame is passed
        # as the data argument.
        causal_model = CausalModel(
            data=pd.DataFrame(), graph=graph, treatment=treatment, outcome=outcome
        )
        return cls(causal_model, treatment, outcome)

    def get_backdoor_paths(self) -> list[list[str]]:
        """Find all backdoor paths between the combined treatment and outcome variables.

        Returns
        -------
        list[list[str]]
            A list of backdoor paths, where each path is represented as a list
            of variable names.

        References
        ----------
        .. [1] Causal Inference in Statistics: A Primer
           By Judea Pearl, Madelyn Glymour, Nicholas P. Jewell · 2016
        """
        # Use DoWhy's internal method to get backdoor paths for all treatments combined
        return self.causal_model._graph.get_backdoor_paths(
            nodes1=self.treatment, nodes2=[self.outcome]
        )

    def get_unique_adjustment_nodes(self) -> list[str]:
        """Collect the adjustment nodes found on the backdoor paths.

        Returns
        -------
        list[str]
            Every node that appears on at least one backdoor path and is
            neither a treatment nor the outcome. Each node is listed once, in
            the order it is first encountered, so the result is reproducible
            across runs.
        """
        paths = self.get_backdoor_paths()
        # dict.fromkeys de-duplicates while keeping first-seen order; a plain
        # set would return the names in a hash-dependent order.
        ordered_unique = dict.fromkeys(
            node
            for path in paths
            for node in path
            if node not in self.treatment and node != self.outcome
        )
        return list(ordered_unique)

    def compute_adjustment_sets(
        self,
        channel_columns: list[str] | tuple[str, ...],
        control_columns: list[str] | None = None,
    ) -> list[str] | None:
        """Compute minimal adjustment sets and handle warnings.

        Parameters
        ----------
        channel_columns : list[str] | tuple[str, ...]
            Names of the channel columns.
        control_columns : list[str] | None
            Candidate control columns. When None, nothing is computed and
            None is returned.

        Returns
        -------
        list[str] | None
            The control columns that belong to the adjustment set and are not
            channel columns, de-duplicated and in the order they were given;
            or None when `control_columns` is None.

        Warns
        -----
        UserWarning
            If some control columns are not in the adjustment set (they are
            dropped), and once for every adjustment-set column that is neither
            a retained control nor a channel column.

        Notes
        -----
        Sets ``self.adjustment_set`` and ``self.minimal_adjustment_set`` as a
        side effect when `control_columns` is not None.
        """
        channel_columns = list(channel_columns)
        if control_columns is None:
            return None

        self.adjustment_set = self.get_unique_adjustment_nodes()
        adjustment_lookup = set(self.adjustment_set)
        channel_lookup = set(channel_columns)

        unique_controls = set(control_columns) - adjustment_lookup
        if unique_controls:
            warnings.warn(
                f"Columns {unique_controls} are not in the adjustment set. Controls are being modified.",
                stacklevel=2,
            )

        # Keep the caller's ordering (de-duplicated) rather than round-tripping
        # through a set, whose iteration order is arbitrary.
        control_columns = [
            column
            for column in dict.fromkeys(control_columns)
            if column in adjustment_lookup and column not in channel_lookup
        ]
        self.minimal_adjustment_set = control_columns + channel_columns

        for column in self.adjustment_set:
            if column not in control_columns and column not in channel_lookup:
                warnings.warn(
                    f"Column {column} in adjustment set not found in data.\n"
                    "Not controlling for this may induce bias in treatment effect estimates.",
                    stacklevel=2,
                )
        return control_columns