Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0cf582f
Add group-aware consensus calculation.
patcon Mar 17, 2025
7acc577
Add and document importance_metric() and priority_metric() for commen…
patcon Mar 18, 2025
ac10052
Add unit tests for priority_metric().
patcon Mar 19, 2025
87ed52f
Add bugfix and test case for arrays in priority_metric(). Add docs.
patcon Mar 19, 2025
1e8caa6
Allow pseudo-count to be a numpy array.
patcon Mar 20, 2025
ef8bf96
Added unit tests for importance_metric().
patcon Mar 20, 2025
89ebfbc
Add arg for pseudo-count into priority_metric().
patcon Mar 20, 2025
2b9d845
Calculate utils.stats.two_prop_test() more effectively, and suppress …
patcon Mar 20, 2025
733b235
Create comment projection functions. Add unit tests.
patcon Mar 20, 2025
b39dc1a
Clean up variable names in comment projection functions.
patcon Mar 20, 2025
5ba7356
Stop passing full pca object during comment projection.
patcon Mar 20, 2025
265fcf9
Rename more variables.
patcon Mar 20, 2025
7101f85
Move pca unit tests.
patcon Mar 21, 2025
f836796
Explain transpose a bit better.
patcon Mar 21, 2025
756b6d8
Add utils.run_pca() unit tests to compare against polismath results.
patcon Mar 21, 2025
39ef047
Test comment projectio and extremity calculations against more fixtures.
patcon Mar 22, 2025
664c361
Convert multiple pca tests into parametrized test.
patcon Mar 22, 2025
4ea598f
Add a test for data integrity issues in data from Polis API.
patcon Mar 22, 2025
c942dd7
Oops. Add the new fixture data.
patcon Mar 22, 2025
2a2124e
Standardize names of fixtures, and notate bad fixture.
patcon Mar 22, 2025
6847427
Merge branch 'main' into more-comment-stats
patcon Mar 23, 2025
aa42607
Merge branch 'main' into more-comment-stats
patcon Mar 31, 2025
99bd83c
Ignore data dump export notebook test artifact.
patcon Mar 31, 2025
7ba0b00
Clean up comments.
patcon Mar 31, 2025
9859402
Add unit tests for importance and priority metric with no votes.
patcon Mar 31, 2025
8d88bd1
Move test helper into own file.
patcon Mar 31, 2025
2bac3e6
Added conversation.json data for medium fixture.
patcon Mar 31, 2025
89fee07
Document process_statments() util function.
patcon Mar 31, 2025
314c3e1
Get medium-no-meta working for comment stats unit tests.
patcon Mar 31, 2025
b7a706e
Small unit test improvements.
patcon Mar 31, 2025
43bb4ca
Add unit test for group-aware-consensus.
patcon Mar 31, 2025
dd1e6af
Add types for full polismath object.
patcon Apr 1, 2025
228f449
Merge branch 'main' into more-comment-stats
patcon Apr 6, 2025
b97fc21
Update CHANGLOG.
patcon Apr 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ uv.lock

test_cache.sqlite

# Python notebook testing
/docs/notebooks/polis-demo-export/

# MkDocs build directory
site/

Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,20 @@

### Fixes
- Allow `is_strict_moderation` to be inferred from not just API data, but file data.
- Better handle numpy divide-by-zero edge-cases in two-proportion test. ([#28](https://github.com/polis-community/red-dwarf/pull/28))

### Changes
- Fixed participant projections to map more closely to Polis with `utils.pca.sparsity_aware_project_ptpt()`.
- Add simple Polis implementation in `reddwarf.implementations.polis`.
- Add singular `polis_id` arg as recommended way to download (auto-detect `report_id` vs `conversation_id`).
- Calculate group-aware consensus stats. ([#28](https://github.com/polis-community/red-dwarf/pull/28))

### Chores
- Moved agora implementation from `reddwarf.agora` to `reddwarf.implementations.agora` (deprecation warning).
- Add missing `conversation.json` fixture file.
- Extract statement processing from polis class-based client to pure util function.
- Add types to fully describe polismath object. ([#28](https://github.com/polis-community/red-dwarf/pull/28))
- Add new fixture for large convo without meta statements. ([#28](https://github.com/polis-community/red-dwarf/pull/28))

## [0.2.0][] (2025-03-24)
### Fixed
Expand Down
6 changes: 4 additions & 2 deletions debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,16 @@
client.find_optimal_k() # Find optimal number of clusters
cluster_labels = client.optimal_cluster_labels

grouped_stats_df = utils.calculate_comment_statistics_by_group(
grouped_stats_df, gac_df = utils.calculate_comment_statistics_dataframes(
vote_matrix=vote_matrix,
cluster_labels=cluster_labels, # type:ignore
)
polis_repness = utils.select_representative_statements(grouped_stats_df=grouped_stats_df)

print(json.dumps(polis_repness, indent=2))

# Group-aware consensus
print(gac_df)

presenter = DataPresenter(client=client)
presenter.render_optimal_cluster_figure()

Expand Down
122 changes: 97 additions & 25 deletions reddwarf/types/polis.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,120 @@
from numpy import integer as npInt
from typing import TypeAlias, Literal
from typing import TypeAlias, Literal, Optional
from typing_extensions import NotRequired, TypedDict


UnixTimestampMillisec: TypeAlias = int
IncrementingId: TypeAlias = int
BaseClusterId: TypeAlias = IncrementingId
GroupId: TypeAlias = IncrementingId
ParticipantId: TypeAlias = IncrementingId
StatementId: TypeAlias = IncrementingId

PolisCommentPriorities: TypeAlias = dict[str, float] # str[StatementId]
PolisUserVoteCounts: TypeAlias = dict[str, int] # str[ParticipantId]

class PolisGroupCluster(TypedDict):
id: GroupId
members: list[BaseClusterId]
center: list[float]
center: tuple[float, float]

# Custom type
class PolisGroupClusterExpanded(TypedDict):
id: GroupId
members: list[ParticipantId]
center: list[float]
center: tuple[float, float]

BaseClusterMembership: TypeAlias = list[ParticipantId]

class PolisBaseClusters(TypedDict):
# Each outer list will be the same length, and will be 100 items or less.
id: list[BaseClusterId]
members: list[list[ParticipantId]]
members: list[BaseClusterMembership]
x: list[float]
y: list[float]
count: list[int]

# Use functional form when attributes have hyphens or are string numbers.
PolisRepnessStatement = TypedDict("PolisRepnessStatement", {
"tid": int,
"n-success": int,
"n-trials": int,
"p-success": float,
"p-test": float,
"repness": float,
"repness-test": float,
"repful-for": Literal["agree", "disagree"],
"best-agree": NotRequired[bool],
"n-agree": NotRequired[int],
})

PolisRepness = TypedDict("PolisRepness", {
"0": list[PolisRepnessStatement],
"1": list[PolisRepnessStatement],
"2": NotRequired[list[PolisRepnessStatement]],
"3": NotRequired[list[PolisRepnessStatement]],
"4": NotRequired[list[PolisRepnessStatement]],
})
class PolisRepnessStatement(TypedDict):
tid: int
n_success: int
n_trials: int
p_success: float
p_test: float
repness: float
repness_test: float
repful_for: Literal["agree", "disagree"]
best_agree: NotRequired[bool]
n_agree: NotRequired[int]

PolisRepness: TypeAlias = dict[str, list[PolisRepnessStatement]] # str[GroupId]

PerBaseVoteCounts: TypeAlias = list[int]

class PolisBaseClusterVoteSummary(TypedDict):
A: PerBaseVoteCounts
D: PerBaseVoteCounts
S: PerBaseVoteCounts

class PolisPCA(TypedDict):
# Each outer list will be the same length, one item for each statement.
center: list[float]
comps: tuple[list[float], list[float]]
comment_projection: tuple[list[float], list[float]]
comment_extremity: list[float]

class PolisConsensusStatement(TypedDict):
tid: StatementId
n_success: int
n_trials: int
p_success: float
p_test: float

class PolisConsensus(TypedDict):
agree: list[PolisConsensusStatement]
disagree: list[PolisConsensusStatement]

class PolisStatementVoteSummary(TypedDict):
A: int
D: int
S: int

class PolisGroupVote(TypedDict):
n_members: int
votes: dict[str, PolisStatementVoteSummary] # str[StatementId]
id: GroupId

PolisGroupVotes: TypeAlias = dict[str, PolisGroupVote] # str[GroupId]

class PolisMath(TypedDict):
tids: list[StatementId]
meta_tids: list[StatementId]
mod_in: list[StatementId]
mod_out: list[StatementId]

in_conv: list[ParticipantId]
user_vote_counts: PolisUserVoteCounts

# For drawing graph
pca: PolisPCA
group_clusters: list[PolisGroupCluster]

# Consensus statements
consensus: PolisConsensus

# Group statements (representative)
repness: PolisRepness
group_votes: PolisGroupVotes

# Overall statements
comment_priorities: PolisCommentPriorities
group_aware_consensus: dict[str, float] # str[StatementId]

# Base clusters
base_clusters: PolisBaseClusters
votes_base: dict[str, PolisBaseClusterVoteSummary] # str[StatementId]

n: int
n_cmts: int
lastModTimestamp: None
lastVoteTimestamp: UnixTimestampMillisec
math_tick: int
91 changes: 80 additions & 11 deletions reddwarf/utils/pca.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from numpy.typing import ArrayLike
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
Expand Down Expand Up @@ -86,29 +87,97 @@ def scale_projected_data(

return projected_data * participant_scaling_coeffs

def sparsity_aware_project_ptpt(votes, comps, center):
# TODO: Clean up variables and docs.
def sparsity_aware_project_ptpt(participant_votes, statement_components, statement_means):
"""
Projects a sparse vote vector into PCA space while adjusting for sparsity.

Args:
participant_votes (list): List of participant votes on each statement
statement_components (list[list[float]]): Two lists of floats corresponding to the two principal components
statement_means (list[float]): List of floats corresponding to the centers/means of each statement

Returns:
projected_coords (list[list[float]]): Two lists corresponding to projected xy coordinates.
"""
comps = np.array(comps) # Shape: (2, n_features)
center = np.array(center) # Shape: (n_features,)
statement_components = np.array(statement_components) # Shape: (2, n_features)
statement_means = np.array(statement_means) # Shape: (n_features,)

# TODO: This includes zeroed-out (moderated) statements. Should it?
n_cmnts = len(votes)
n_statements = len(participant_votes)

ptpt_votes = np.array(votes)
statement_mask = ~np.isnan(votes) # Only consider non-null values
participant_votes = np.array(participant_votes)
mask = ~np.isnan(participant_votes) # Only consider non-null values

# Extract relevant values
x_vals = ptpt_votes[statement_mask] - center[statement_mask] # Centered values
x_vals = participant_votes[mask] - statement_means[mask] # Centered values
# TODO: Extend this to work in 3D
pc1_vals, pc2_vals = comps[:, statement_mask] # Select only used components
pc1_vals, pc2_vals = statement_components[:, mask] # Select only used components

# Compute dot product projection
p1 = np.dot(x_vals, pc1_vals)
p2 = np.dot(x_vals, pc2_vals)

n_votes = np.count_nonzero(statement_mask) # Non-null votes count
scale = np.sqrt(n_cmnts / max(n_votes, 1))
# Non-null votes count
n_votes = np.count_nonzero(mask)
scale = np.sqrt(n_statements / max(n_votes, 1))

projected_coord = scale * np.array([p1, p2])

return projected_coord

# TODO: Clean up variables and docs.
def sparsity_aware_project_ptpts(vote_matrix, statement_components, statement_means):
    """
    Run the sparsity-aware projection over every vote vector in a matrix.

    Args:
        vote_matrix: Iterable of vote vectors; each item is handed as-is to
            sparsity_aware_project_ptpt() as `participant_votes`.
        statement_components: Principal components, forwarded unchanged.
        statement_means: Per-statement centers/means, forwarded unchanged.

    Returns:
        projected_coords (np.ndarray): The individual projections stacked in
            the same order as the items of `vote_matrix`.
    """
    per_participant_coords = [
        sparsity_aware_project_ptpt(row_votes, statement_components, statement_means)
        for row_votes in vote_matrix
    ]
    return np.array(per_participant_coords)

# TODO: Clean up variables and docs.
def pca_project_cmnts(statement_components, statement_means):
    """
    Project each statement into PCA space to understand its placement.

    A "virtual participant" is created for every statement: someone who cast
    exactly one vote, on that statement only. Projecting these virtual
    participants with the sparsity-aware participant projection gives the
    position of each statement.

    Args:
        statement_components: Principal components, forwarded unchanged to
            sparsity_aware_project_ptpts().
        statement_means: Per-statement centers/means; its length determines
            the number of statements.

    Returns:
        statement_projections (np.ndarray): One projected coordinate pair per
            statement, in statement order. Shape (n_statements, 2).
    """
    n_statements = len(statement_means)

    # Square matrix of virtual participants: NaN (no vote) everywhere except
    # the diagonal, where participant i votes once on statement i.
    virtual_vote_matrix = np.full(shape=[n_statements, n_statements], fill_value=np.nan)
    # TODO: Why does Polis use -1 (disagree) here? is it the same? BUG?
    np.fill_diagonal(virtual_vote_matrix, -1)

    # One xy pair per statement. Shape (n_statements, 2)
    statement_projections = sparsity_aware_project_ptpts(
        virtual_vote_matrix,
        statement_components,
        statement_means,
    )

    return statement_projections

def calculate_extremity(projections: ArrayLike):
    """
    Compute the extremity (vector magnitude) of each projected point.

    Args:
        projections (ArrayLike): Coordinates laid out with one row per axis
            and one column per point, e.g. [x_values, y_values] with shape
            (2, n_points).

    Returns:
        extremities (np.ndarray): Euclidean norm of each column, i.e. one
            magnitude per point.
    """
    # axis=0 collapses the rows (the per-axis coordinates), so each column
    # (one point) is reduced to its Euclidean norm = hypotenuse of xy.
    return np.linalg.norm(projections, axis=0)

# TODO: Clean up variables and docs.
def with_proj_and_extremity(pca):
"""
Compute projection and extremity, then merge into PCA results.
"""
statement_projections = pca_project_cmnts(
statement_components=pca["comps"],
statement_means=pca["center"],
)
# Flip the axes to get all x together and y together.
# Two lists (all x, then all y), one value per statement. Shape (2, n_statements)
statement_projections = statement_projections.transpose()

statement_extremities = calculate_extremity(statement_projections)

pca["comment-projection"] = statement_projections.tolist()
pca["comment-extremity"] = statement_extremities.tolist()

return scale * np.array([p1, p2])
return pca
18 changes: 17 additions & 1 deletion reddwarf/utils/statements.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,28 @@
import pandas as pd
from reddwarf.models import ModeratedEnum

def process_statements(statement_data=[]):
def process_statements(
statement_data: list[dict] = [],
) -> tuple[pd.DataFrame, list, list, list]:
"""
Process raw statement data into a dataframe, and various lists of statement IDs.

Args:
statement_data (list[dict]): raw list of statement data dicts

Returns:
statements_df (pd.DataFrame): Dataframe of statements
mod_in_statement_ids (list): List of statement IDs to moderate in
mod_out_statement_ids (list): List of statement IDs to moderate out
meta_statement_ids (list): List of meta statement IDs

"""
mod_in_statement_ids = []
mod_out_statement_ids = []
meta_statement_ids = []

statements_df = (pd.DataFrame
# TODO: See if both "moderated" and "mod" can end up in here. BUG?
.from_records(statement_data)
.set_index('statement_id')
.sort_index()
Expand Down
Loading