polis-community · patcon · Apr 6, 2025 · Mar 17, 2025 · Mar 18, 2025 · Mar 19, 2025
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,9 @@ uv.lock
 
 test_cache.sqlite
 
+# Python notebook testing
+/docs/notebooks/polis-demo-export/
+
 # MkDocs build directory
 site/
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,16 +4,20 @@
 
 ### Fixes
 - Allow `is_strict_moderation` to be inferred from not just API data, but file data.
+- Better handle numpy divide-by-zero edge-cases in two-property test. ([#28](https://github.com/polis-community/red-dwarf/pull/28))
 
 ### Changes
 - Fixed participant projections to map more closely to Polis with `utils.pca.sparsity_aware_project_ptpt()`.
 - Add simple Polis implementation in `reddwarf.implementations.polis`.
 - Add singular `polis_id` arg as recommended way to download (auto-detect `report_id` vs `converation_id`).
+- Calculate group-aware consensus stats. ([#28](https://github.com/polis-community/red-dwarf/pull/28))
 
 ### Chores
 - Moved agora implementation from `reddwarf.agora` to `reddwarf.implementations.agora` (deprecation warning).
 - Add missing `conversation.json` fixture file.
 - Extract statement processing from polis class-based client to pure util function.
+- Add types to fully describe polismath object. ([#28](https://github.com/polis-community/red-dwarf/pull/28))
+- Add new fixture for large convo without meta statements. ([#28](https://github.com/polis-community/red-dwarf/pull/28))
 
 ## [0.2.0][] (2025-03-24)
 ### Fixed

diff --git a/debug.py b/debug.py
@@ -66,14 +66,16 @@
         client.find_optimal_k()  # Find optimal number of clusters
         cluster_labels = client.optimal_cluster_labels
 
-    grouped_stats_df = utils.calculate_comment_statistics_by_group(
+    grouped_stats_df, gac_df = utils.calculate_comment_statistics_dataframes(
         vote_matrix=vote_matrix,
         cluster_labels=cluster_labels, # type:ignore
     )
     polis_repness = utils.select_representative_statements(grouped_stats_df=grouped_stats_df)
-
     print(json.dumps(polis_repness, indent=2))
 
+    # Group-aware consensus
+    print(gac_df)
+
     presenter = DataPresenter(client=client)
     presenter.render_optimal_cluster_figure()
 

diff --git a/reddwarf/types/polis.py b/reddwarf/types/polis.py
@@ -1,48 +1,120 @@
 from numpy import integer as npInt
-from typing import TypeAlias, Literal
+from typing import TypeAlias, Literal, Optional
 from typing_extensions import NotRequired, TypedDict
 
+
+UnixTimestampMillisec: TypeAlias = int
 IncrementingId: TypeAlias = int
 BaseClusterId: TypeAlias = IncrementingId
 GroupId: TypeAlias = IncrementingId
 ParticipantId: TypeAlias = IncrementingId
+StatementId: TypeAlias = IncrementingId
+
+PolisCommentPriorities: TypeAlias = dict[str, float] # str[StatementId]
+PolisUserVoteCounts: TypeAlias = dict[str, int] # str[ParticipantId]
 
 class PolisGroupCluster(TypedDict):
     id: GroupId
     members: list[BaseClusterId]
-    center: list[float]
+    center: tuple[float, float]
 
+# Custom type
 class PolisGroupClusterExpanded(TypedDict):
     id: GroupId
     members: list[ParticipantId]
-    center: list[float]
+    center: tuple[float, float]
+
+BaseClusterMembership: TypeAlias = list[ParticipantId]
 
 class PolisBaseClusters(TypedDict):
     # Each outer list will be the same length, and will be 100 items or less.
     id: list[BaseClusterId]
-    members: list[list[ParticipantId]]
+    members: list[BaseClusterMembership]
     x: list[float]
     y: list[float]
     count: list[int]
 
-# Use functional form when attributes have hyphens or are string numbers.
-PolisRepnessStatement = TypedDict("PolisRepnessStatement", {
-    "tid": int,
-    "n-success": int,
-    "n-trials": int,
-    "p-success": float,
-    "p-test": float,
-    "repness": float,
-    "repness-test": float,
-    "repful-for": Literal["agree", "disagree"],
-    "best-agree": NotRequired[bool],
-    "n-agree": NotRequired[int],
-})
-
-PolisRepness = TypedDict("PolisRepness", {
-    "0": list[PolisRepnessStatement],
-    "1": list[PolisRepnessStatement],
-    "2": NotRequired[list[PolisRepnessStatement]],
-    "3": NotRequired[list[PolisRepnessStatement]],
-    "4": NotRequired[list[PolisRepnessStatement]],
-})
+class PolisRepnessStatement(TypedDict):
+    tid: int
+    n_success: int
+    n_trials: int
+    p_success: float
+    p_test: float
+    repness: float
+    repness_test: float
+    repful_for: Literal["agree", "disagree"]
+    best_agree: NotRequired[bool]
+    n_agree: NotRequired[int]
+
+PolisRepness: TypeAlias = dict[str, list[PolisRepnessStatement]] # str[GroupId]
+
+PerBaseVoteCounts: TypeAlias = list[int]
+
+class PolisBaseClusterVoteSummary(TypedDict):
+    A: PerBaseVoteCounts
+    D: PerBaseVoteCounts
+    S: PerBaseVoteCounts
+
+class PolisPCA(TypedDict):
+    # Each outer list will be the same length, one item for each statement.
+    center: list[float]
+    comps: tuple[list[float], list[float]]
+    comment_projection: tuple[list[float], list[float]]
+    comment_extremity: list[float]
+
+class PolisConsensusStatement(TypedDict):
+    tid: StatementId
+    n_success: int
+    n_trials: int
+    p_success: float
+    p_test: float
+
+class PolisConsensus(TypedDict):
+    agree: list[PolisConsensusStatement]
+    disagree: list[PolisConsensusStatement]
+
+class PolisStatementVoteSummary(TypedDict):
+    A: int
+    D: int
+    S: int
+
+class PolisGroupVote(TypedDict):
+    n_members: int
+    votes: dict[str, PolisStatementVoteSummary] # str[StatementId]
+    id: GroupId
+
+PolisGroupVotes: TypeAlias = dict[str, PolisGroupVote] # str[GroupId]
+
+class PolisMath(TypedDict):
+    tids: list[StatementId]
+    meta_tids: list[StatementId]
+    mod_in: list[StatementId]
+    mod_out: list[StatementId]
+
+    in_conv: list[ParticipantId]
+    user_vote_counts: PolisUserVoteCounts
+
+    # For drawing graph
+    pca: PolisPCA
+    group_clusters: list[PolisGroupCluster]
+
+    # Consensus statements
+    consensus: PolisConsensus
+
+    # Group statements (representative)
+    repness: PolisRepness
+    group_votes: PolisGroupVotes
+
+    # Overall statements
+    comment_priorities: PolisCommentPriorities
+    group_aware_consensus: dict[str, float] # str[StatementId]
+
+    # Base clusters
+    base_clusters: PolisBaseClusters
+    votes_base: dict[str, PolisBaseClusterVoteSummary] # str[StatementId]
+
+    n: int
+    n_cmts: int
+    lastModTimestamp: None
+    lastVoteTimestamp: UnixTimestampMillisec
+    math_tick: int
diff --git a/reddwarf/utils/pca.py b/reddwarf/utils/pca.py
@@ -1,3 +1,4 @@
+from numpy.typing import ArrayLike
 import pandas as pd
 import numpy as np
 from sklearn.decomposition import PCA
@@ -86,29 +87,97 @@ def scale_projected_data(
 
     return projected_data * participant_scaling_coeffs
 
-def sparsity_aware_project_ptpt(votes, comps, center):
+# TODO: Clean up variables and docs.
+def sparsity_aware_project_ptpt(participant_votes, statement_components, statement_means):
     """
     Projects a sparse vote vector into PCA space while adjusting for sparsity.
+
+    Args:
+        participant_votes (list): List of participant votes on each statement
+        statement_components (list[list[float]]): Two lists of floats corresponding to the two principal components
+        statement_means (list[float]): List of floats corresponding to the centers/means of each statement
+
+    Returns:
+        projected_coord (np.ndarray): Array of shape (2,) holding the participant's projected xy coordinate.
     """
-    comps = np.array(comps)  # Shape: (2, n_features)
-    center = np.array(center)  # Shape: (n_features,)
+    statement_components = np.array(statement_components)  # Shape: (2, n_features)
+    statement_means = np.array(statement_means)  # Shape: (n_features,)
 
     # TODO: This included zerod out (moderated) statements. Should it?
-    n_cmnts = len(votes)
+    n_statements = len(participant_votes)
 
-    ptpt_votes = np.array(votes)
-    statement_mask = ~np.isnan(votes)  # Only consider non-null values
+    participant_votes = np.array(participant_votes)
+    mask = ~np.isnan(participant_votes)  # Only consider non-null values
 
     # Extract relevant values
-    x_vals = ptpt_votes[statement_mask] - center[statement_mask]  # Centered values
+    x_vals = participant_votes[mask] - statement_means[mask]  # Centered values
     # TODO: Extend this to work in 3D
-    pc1_vals, pc2_vals = comps[:, statement_mask]  # Select only used components
+    pc1_vals, pc2_vals = statement_components[:, mask]  # Select only used components
 
     # Compute dot product projection
     p1 = np.dot(x_vals, pc1_vals)
     p2 = np.dot(x_vals, pc2_vals)
 
-    n_votes = np.count_nonzero(statement_mask)  # Non-null votes count
-    scale = np.sqrt(n_cmnts / max(n_votes, 1))
+    # Non-null votes count
+    n_votes = np.count_nonzero(mask)
+    scale = np.sqrt(n_statements / max(n_votes, 1))
+
+    projected_coord = scale * np.array([p1, p2])
+
+    return projected_coord
+
+# TODO: Clean up variables and docs.
+def sparsity_aware_project_ptpts(vote_matrix, statement_components, statement_means):
+    """
+    Apply sparsity-aware projection to multiple vote vectors.
+    """
+    return np.array([
+        sparsity_aware_project_ptpt(participant_votes, statement_components, statement_means)
+        for participant_votes in vote_matrix]
+    )
+
+# TODO: Clean up variables and docs.
+def pca_project_cmnts(statement_components, statement_means):
+    """
+    Projects unit vectors for each feature into PCA space to understand their placement.
+    """
+    n_statements = len(statement_means)
+    # Create a matrix of virtual participants that each vote once on a single statement.
+    virtual_vote_matrix = np.full(shape=[n_statements, n_statements], fill_value=np.nan)
+    for i in range(n_statements):
+        # TODO: Why does Polis use -1 (disagree) here? is it the same? BUG?
+        virtual_vote_matrix[i][i] = -1  # Create unit vector representation
+
+    # One xy pair per statement. shape (n_statements, 2)
+    statement_projections = sparsity_aware_project_ptpts(
+        virtual_vote_matrix,
+        statement_components,
+        statement_means,
+    )
+
+    return statement_projections
+
+def calculate_extremity(projections: ArrayLike):
+    # Compute extremity as the vector magnitude of each column (axis=0),
+    # i.e. the Euclidean norm (xy hypotenuse) when given shape (2, n_statements).
+    return np.linalg.norm(projections, axis=0)
+
+# TODO: Clean up variables and docs.
+def with_proj_and_extremity(pca):
+    """
+    Compute projection and extremity, then merge into PCA results.
+    """
+    statement_projections = pca_project_cmnts(
+        statement_components=pca["comps"],
+        statement_means=pca["center"],
+    )
+    # Flip the axes to get all x together and y together.
+    # 2 lists, one value per statement. shape (2, n_statements)
+    statement_projections = statement_projections.transpose()
+
+    statement_extremities = calculate_extremity(statement_projections)
+
+    pca["comment-projection"] = statement_projections.tolist()
+    pca["comment-extremity"] = statement_extremities.tolist()
 
-    return scale * np.array([p1, p2])
+    return pca
diff --git a/reddwarf/utils/statements.py b/reddwarf/utils/statements.py
@@ -1,12 +1,28 @@
 import pandas as pd
 from reddwarf.models import ModeratedEnum
 
-def process_statements(statement_data=[]):
+def process_statements(
+    statement_data: list[dict] = [],
+) -> tuple[pd.DataFrame, list, list, list]:
+    """
+    Process raw statement data into a dataframe, and various lists of statement IDs.
+
+    Args:
+        statement_data (list[dict]): raw list of statement data dicts
+
+    Returns:
+        statements_df (pd.DataFrame): Dataframe of statements
+        mod_in_statement_ids (list): List of statement IDs to moderate in
+        mod_out_statement_ids (list): List of statement IDs to moderate out
+        meta_statement_ids (list): List of meta statement IDs
+
+    """
     mod_in_statement_ids = []
     mod_out_statement_ids = []
     meta_statement_ids = []
 
     statements_df = (pd.DataFrame
+        # TODO: See if both "moderated" and "mod" can end up in here. BUG?
         .from_records(statement_data)
         .set_index('statement_id')
         .sort_index()