from scipy.stats import pearsonr import pandas as pd import seaborn as sns
def corr_full(df, numeric_only=True, rows=('corr', 'p-value', 'obs')):
    """Generate a correlation matrix with coefficients, p-values and counts.

    For every pair of columns the table can show up to three stacked rows:
    the Pearson correlation coefficient, its p-value (formatted as a string
    in parentheses) and the number of observations used for the pair.

    Args:
        - df: Input dataframe.
        - numeric_only (bool): Forwarded to ``DataFrame.corr``; whether to
          consider only numeric columns for correlation. Default is True.
        - rows: Iterable selecting the information to show, any of 'corr',
          'p-value' and 'obs'. Default is all three, in that order.

    Returns:
        - formatted_table: DataFrame whose index is a MultiIndex of
          (column, row-kind) and whose columns are the correlated columns.
          Cells that are not filled (e.g. the p-value on the diagonal) are
          left as NaN.
    """
    # Materialise once so any iterable (tuple, list, generator) is accepted
    # and can be both iterated and membership-tested below.
    rows = list(rows)
    show_corr = 'corr' in rows
    show_pvalue = 'p-value' in rows
    show_obs = 'obs' in rows

    # Calculate the Pearson correlation coefficients
    corr_matrix = df.corr(numeric_only=numeric_only)
    columns = corr_matrix.columns

    # Calculate the p-values using scipy's pearsonr
    pvalue_matrix = df.corr(
        numeric_only=numeric_only,
        method=lambda x, y: pearsonr(x, y)[1],
    )

    # Calculate the observation count for each pair of columns: the number
    # of rows where BOTH columns are non-null. This matches the
    # pairwise-complete rows DataFrame.corr uses; taking the minimum of the
    # per-column counts overstates N when gaps fall on different rows.
    present = df[columns].notna().astype(int)
    obs_matrix = present.T.dot(present)

    # Create a multi-index dataframe to store the formatted correlations
    formatted_table = pd.DataFrame(
        index=pd.MultiIndex.from_product([columns, rows]),
        columns=columns,
    )

    # Assign values to the appropriate cells in the formatted table
    for col1 in columns:
        for col2 in columns:
            if show_corr:
                formatted_table.loc[(col1, 'corr'), col2] = (
                    corr_matrix.loc[col1, col2]
                )
            # Avoid p-values for the diagonal: a column always correlates
            # perfectly with itself, so the value carries no information.
            if show_pvalue and col1 != col2:
                formatted_table.loc[(col1, 'p-value'), col2] = (
                    f'({pvalue_matrix.loc[col1, col2]:.4f})'
                )
            if show_obs:
                formatted_table.loc[(col1, 'obs'), col2] = (
                    obs_matrix.loc[col1, col2]
                )

    return formatted_table