from pgmpy.models import BayesianModel, BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Loading Data
df = pd.read_csv('Financial_well_being_literacy_Romania_csv.csv', encoding='cp1252')

# Check the first 5 rows of the data
df.head()

# Showing the column
columns = df.columns
columns

Index(['id', 'NUTS3', 'NUTS2', 'SD1', 'SD1a', 'SD2', 'SD3', 'age', 'age2',
       'SD4', 'SD5', 'SD6', 'SD7', 'SD8', 'SD9', 'I1', 'I2_1', 'I2_2', 'I2_3',
       'I2_4', 'I2_5', 'I2_6', 'I2_7', 'I2_8', 'I2_0', 'I3_1', 'I3_2', 'I3_3',
       'I3_4', 'I3_5', 'I3_6', 'I4_1', 'I4_2', 'I4_3', 'I4_4', 'I5', 'I6_1',
       'I6_2', 'I6_3', 'I6_4', 'I6_5', 'I6_7', 'I6_8', 'I6_9', 'I7_1', 'I7_2',
       'I7_3', 'I7_4', 'I7_5', 'I7_6', 'I7_0', 'A1_1', 'A1_2', 'A1_3', 'A1_4',
       'A1_5', 'A1_6', 'B1_1', 'B1_2', 'B1_3', 'B1_4', 'C1', 'C2', 'C3', 'C4',
       'C5', 'C6', 'C7', 'C8', 'weight', 'age16_61', 'self', 'FWB'],
      dtype='object')

# Checking for the total number of null values in each columns
for col_name in df.columns:
  count_nan = df[col_name].isna().sum()
  print (f"Column = {col_name}; Count of NaN = {count_nan}")

Column = id; Count of NaN = 0
Column = NUTS3; Count of NaN = 0
Column = NUTS2; Count of NaN = 0
Column = SD1; Count of NaN = 0
Column = SD1a; Count of NaN = 0
Column = SD2; Count of NaN = 0
Column = SD3; Count of NaN = 0
Column = age; Count of NaN = 0
Column = age2; Count of NaN = 0
Column = SD4; Count of NaN = 0
Column = SD5; Count of NaN = 0
Column = SD6; Count of NaN = 0
Column = SD7; Count of NaN = 0
Column = SD8; Count of NaN = 0
Column = SD9; Count of NaN = 0
Column = I1; Count of NaN = 0
Column = I2_1; Count of NaN = 0
Column = I2_2; Count of NaN = 0
Column = I2_3; Count of NaN = 0
Column = I2_4; Count of NaN = 0
Column = I2_5; Count of NaN = 0
Column = I2_6; Count of NaN = 0
Column = I2_7; Count of NaN = 0
Column = I2_8; Count of NaN = 0
Column = I2_0; Count of NaN = 0
Column = I3_1; Count of NaN = 997
Column = I3_2; Count of NaN = 997
Column = I3_3; Count of NaN = 997
Column = I3_4; Count of NaN = 997
Column = I3_5; Count of NaN = 996
Column = I3_6; Count of NaN = 997
Column = I4_1; Count of NaN = 0
Column = I4_2; Count of NaN = 0
Column = I4_3; Count of NaN = 0
Column = I4_4; Count of NaN = 0
Column = I5; Count of NaN = 0
Column = I6_1; Count of NaN = 0
Column = I6_2; Count of NaN = 0
Column = I6_3; Count of NaN = 0
Column = I6_4; Count of NaN = 0
Column = I6_5; Count of NaN = 0
Column = I6_7; Count of NaN = 0
Column = I6_8; Count of NaN = 0
Column = I6_9; Count of NaN = 0
Column = I7_1; Count of NaN = 0
Column = I7_2; Count of NaN = 0
Column = I7_3; Count of NaN = 0
Column = I7_4; Count of NaN = 0
Column = I7_5; Count of NaN = 0
Column = I7_6; Count of NaN = 0
Column = I7_0; Count of NaN = 0
Column = A1_1; Count of NaN = 0
Column = A1_2; Count of NaN = 0
Column = A1_3; Count of NaN = 0
Column = A1_4; Count of NaN = 0
Column = A1_5; Count of NaN = 0
Column = A1_6; Count of NaN = 0
Column = B1_1; Count of NaN = 0
Column = B1_2; Count of NaN = 0
Column = B1_3; Count of NaN = 0
Column = B1_4; Count of NaN = 0
Column = C1; Count of NaN = 0
Column = C2; Count of NaN = 0
Column = C3; Count of NaN = 0
Column = C4; Count of NaN = 0
Column = C5; Count of NaN = 0
Column = C6; Count of NaN = 0
Column = C7; Count of NaN = 0
Column = C8; Count of NaN = 0
Column = weight; Count of NaN = 0
Column = age16_61; Count of NaN = 0
Column = self; Count of NaN = 0
Column = FWB; Count of NaN = 0

df['SD2'].describe()

count       1391
unique         2
top       Female
freq         722
Name: SD2, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['SD2'])):
    print(unique_value)

Male
Female

# Histogram plot for visualization
df['SD2'].hist()

<Axes: >

# Get the count of each unique value
pd.crosstab(df['SD2'], 'Count')

# Get the probability by normalizing the counts of unique value
gender_probability = pd.crosstab(df['SD2'], 'Probability', normalize=True)
gender_probability

df['SD3'].describe()

count    1391.000000
mean       47.386053
std        15.071763
min        16.000000
25%        36.000000
50%        47.000000
75%        58.000000
max        90.000000
Name: SD3, dtype: float64

# Histogram plot for visualization
df['SD3'].hist(figsize=(7,5))

<Axes: >

# Setting age labels
age_labels = [
    '16-36', '37-56', '57-76', '76+'
]

# Edge value of each bins
age_groups = [15,36,56,76,90]

# Sort according to the age groups set
df['SD3_processed'] = pd.cut(df['SD3'], age_groups, labels=age_labels)
df['SD3_processed']

0       37-56
1       16-36
2       37-56
3       37-56
4       57-76
        ...  
1386    37-56
1387    37-56
1388    16-36
1389    37-56
1390    37-56
Name: SD3_processed, Length: 1391, dtype: category
Categories (4, object): ['16-36' < '37-56' < '57-76' < '76+']

# Bar plot for visualization
df['SD3_processed'].value_counts().loc[age_labels].plot.bar()

<Axes: xlabel='SD3_processed'>

# Setting age labels
age_labels = [
    '16-36', '37-56', '57+'
]

# Edge value of each bins
age_groups = [15,36,56,90]

# Sort according to the age groups set
df['SD3_processed'] = pd.cut(df['SD3'], age_groups, labels=age_labels)
df['SD3_processed']

0       37-56
1       16-36
2       37-56
3       37-56
4         57+
        ...  
1386    37-56
1387    37-56
1388    16-36
1389    37-56
1390    37-56
Name: SD3_processed, Length: 1391, dtype: category
Categories (3, object): ['16-36' < '37-56' < '57+']

# Bar plot for visualization
df['SD3_processed'].value_counts().loc[age_labels].plot.bar()

<Axes: xlabel='SD3_processed'>

# Get the count of each unique value
pd.crosstab(df['SD3_processed'], 'Count')

# Get the probability by normalizing the counts of unique value
age_probability = pd.crosstab(df['SD3_processed'], 'Probability', normalize=True)
age_probability

df["SD4"].describe()

count                        1391
unique                          5
top       High school (12 grades)
freq                          697
Name: SD4, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['SD4'])):
	print(unique_value)

High school (12 grades)
Bachelor and master education
Middle school (8 grades)
Primary school (4 grades) 
Post-graduate education

# Educational Attainment in ascending order
education_order = [
    'Primary school (4 grades) ',
    'Middle school (8 grades)',
    'High school (12 grades)',
    'Bachelor and master education',
    'Post-graduate education'
]

# Bar plot for visualization
df['SD4'].value_counts().loc[education_order].plot.bar()

<Axes: xlabel='SD4'>

education_group_changes = {
    'Primary school (4 grades) ': 'Middle school (8 grades) and below',
    'Middle school (8 grades)': 'Middle school (8 grades) and below',
    'Bachelor and master education': 'Bachelor and above',
    'Post-graduate education': 'Bachelor and above'
}

df['SD4_processed'] = df['SD4'].replace(education_group_changes)
df['SD4_processed']

0                  High school (12 grades)
1                       Bachelor and above
2                       Bachelor and above
3                  High school (12 grades)
4       Middle school (8 grades) and below
                       ...                
1386                    Bachelor and above
1387               High school (12 grades)
1388               High school (12 grades)
1389               High school (12 grades)
1390               High school (12 grades)
Name: SD4_processed, Length: 1391, dtype: object

# Get the count of each unique value
pd.crosstab(df['SD4_processed'], 'Count')

# Get the probability by normalizing the counts of unique value
education_probability = pd.crosstab(df['SD4_processed'], 'Probability', normalize=True)
education_probability

# Educational Attainment in ascending order
education_order = [
    'Middle school (8 grades) and below',
    'High school (12 grades)',
    'Bachelor and above'
]

# Bar plot for visualization
df['SD4_processed'].value_counts().loc[education_order].plot.bar()

<Axes: xlabel='SD4_processed'>

df['I1'].describe()

count                                                  1391
unique                                                    4
top       No, we don’t keep records, but we know how muc...
freq                                                    550
Name: I1, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['I1'])):
    print(unique_value)

Yes, we keep records, but not all revenues and expenses are recorded
No, we don’t keep records, but we know how much money we earn and spend during a month
No, we don’t keep records, and we don’t know how much money we earn and spend during a month
Yes, we keep records of all revenues and all expenses

# Score given to each of the statement
record_keeping_assessment = {
    'Yes, we keep records of all revenues and all expenses': 3,
    'No, we don’t keep records, and we don’t know how much money we earn and spend during a month':0,
    'No, we don’t keep records, but we know how much money we earn and spend during a month': 1,
    'Yes, we keep records, but not all revenues and expenses are recorded':2
}

# Replacing the statement with the score
df['I1_processed'] = df['I1'].replace(record_keeping_assessment)

# Check the unprocessed column with the processed column
df[['I1','I1_processed']]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/1258281535.py:10: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df['I1_processed'] = df['I1'].replace(record_keeping_assessment)

# Bar plot for visualization
df['I1_processed'].value_counts().loc[[0,1,2,3]].plot.bar()

<Axes: xlabel='I1_processed'>

# Create a list for the columns
list_of_columns_for_money_invested = []
for i in range(1,9):
    list_of_columns_for_money_invested.append("I2_" + str(i))

df[['I2_0',*list_of_columns_for_money_invested]]

# Score given to each of financial instrument, these are the values applied if it is a yes.
financial_instrument_assessment = [2,3,3,3,3,3,3,1]

# Looping through the columns and replacing yes with the values listed in the list above, while a no will be 0
for no, column in enumerate(list_of_columns_for_money_invested):
    df[column + '_processed'] = df[column].apply(
        lambda x: financial_instrument_assessment[no] if x == 'Yes' else 0
    )

df['I2_0_processed'] = df['I2_0'].apply(lambda x: 0 if x == "Yes" else 1)

# Create a list for the processed columns
list_of_columns_for_money_invested_processed = [x + '_processed' for x in list_of_columns_for_money_invested]

# Include the first column that was processed separately
list_of_columns_for_money_invested_processed.append('I2_0_processed')

# Sort the list in ascending order.
list_of_columns_for_money_invested_processed.sort()

df[list_of_columns_for_money_invested_processed]

# Sum all the processed columns of I2
df['I2_processed'] = df[list_of_columns_for_money_invested_processed].sum(axis=1)
df['I2_processed']

0       9
1       7
2       9
3       2
4       4
       ..
1386    9
1387    4
1388    4
1389    0
1390    0
Name: I2_processed, Length: 1391, dtype: int64

# Bar plot for visualization
df['I2_processed'].value_counts().plot.bar(figsize=(10,7))

<Axes: xlabel='I2_processed'>

# Showing both columns together
df[['I2_processed', 'I1_processed']]

# Combining both I1 Processed and I2 Processed for Financial Behaviour and Attitude Index
df['I1&I2'] = ( (df['I2_processed'] * 0.5) + (df['I1_processed'] * 0.5) )
df['I1&I2']

0       5.5
1       4.0
2       5.5
3       2.0
4       2.5
       ... 
1386    5.5
1387    2.5
1388    2.5
1389    1.5
1390    0.5
Name: I1&I2, Length: 1391, dtype: float64

df['I1&I2'].describe()

count    1391.000000
mean        1.706326
std         1.268400
min         0.000000
25%         0.500000
50%         1.500000
75%         2.500000
max         6.500000
Name: I1&I2, dtype: float64

# Visualize the data
df['I1&I2'].hist()

<Axes: >

# Setting the Financial Behaviour labels
financial_behaviour_score_labels = ['0-2', '2-4', '4+']

# Edge values for each bin
financial_behaviour_groups = [-1,2,4,7]

# Group the values into the three groups
df['Financial Behaviour'] = pd.cut(df['I1&I2'], financial_behaviour_groups, labels=financial_behaviour_score_labels)

df[['I1&I2','Financial Behaviour']]

# Visualize final financial behaviour column
df['Financial Behaviour'].value_counts().plot.bar()

<Axes: xlabel='Financial Behaviour'>

# Get the count of each unique value
pd.crosstab(df['Financial Behaviour'], 'Count')

# Get the probability by normalizing the counts of unique value
financial_behaviour_probability = pd.crosstab(df['Financial Behaviour'], 'Probability', normalize=True)
financial_behaviour_probability

df['C1'].describe()

count        1391
unique          4
top       1 in 10
freq          537
Name: C1, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C1'])):
	print(unique_value)

1 in 1,000,000
1 in 10
1 in 1,000
Don’t know

list_of_columns_for_financial_literacy = []

# Make a dictionary based on the values awarded
c1_score = {
    'Don’t know':0,
    '1 in 10':3,
    '1 in 1,000':2,
    '1 in 1,000,000':1
}

# Temp variable for new column name
temp_col = 'C1_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C1'].replace(c1_score)
df[['C1',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/1883059767.py:15: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C1'].replace(c1_score)

df['C2'].describe()

count                   1391
unique                     4
top       More than LEI 150 
freq                     464
Name: C2, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C2'])):
	print(unique_value)

Don’t know
More than LEI 150 
Less than LEI 150
Exactly LEI 150

# Make a dictionary based on the values awarded
c2_score = {
    'Don’t know':0,
    'More than LEI 150 ':3,
    'Exactly LEI 150 ':2,
    'Less than LEI 150':1
}

# Temp variable for new column name
temp_col = 'C2_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C2'].replace(c2_score)
df[['C2',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/670163980.py:15: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C2'].replace(c2_score)

df['C3'].describe()

count     1391
unique       3
top       True
freq       753
Name: C3, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C3'])):
	print(unique_value)

Don’t know
True
False

# Make a dictionary based on the values awarded
c3_score = {
    'Don’t know':0,
    'True':3,
    'False':1
}

# Temp variable for new column name
temp_col = 'C3_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C3'].replace(c3_score)
df[['C3',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/1063833420.py:14: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C3'].replace(c3_score)

df['C4'].describe()

count                      1391
unique                        4
top       They are equally rich
freq                        516
Name: C4, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C4'])):
	print(unique_value)

His sibling
They are equally rich
My friend
Don’t know

# Make a dictionary based on the values awarded
c4_score = {
    'Don’t know':0,
    'My friend':3,
    'They are equally rich': 2,
    'His sibling':1
}

# Temp variable for new column name
temp_col = 'C4_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C4'].replace(c4_score)
df[['C4',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/3573706696.py:15: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C4'].replace(c4_score)

df['C5'].describe()

count         1391
unique           4
top       The same
freq           635
Name: C5, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C5'])):
	print(unique_value)

More
The same
Less
Don’t know

# Make a dictionary based on the values awarded
c5_score = {
    'Don’t know':0,
    'The same':3,
    'Less': 2,
    'More':1
}

# Temp variable for new column name
temp_col = 'C5_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C5'].replace(c5_score)
df[['C5',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/4079600377.py:15: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C5'].replace(c5_score)

df['C6'].describe()

count                                 1391
unique                                   3
top       Multiple business of investments
freq                                   566
Name: C6, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C6'])):
	print(unique_value)

Multiple business of investments
One business or investment
Don’t know

# Make a dictionary based on the values awarded
c6_score = {
    'Don’t know':0,
    'Multiple business of investments':3,
    'One business or investment':1
}

# Temp variable for new column name
temp_col = 'C6_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C6'].replace(c6_score)
df[['C6',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/982456800.py:14: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C6'].replace(c6_score)

df['C7'].describe()

count           1391
unique             4
top       Don’t know
freq             541
Name: C7, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C7'])):
    print(unique_value)

Savings deposit
Bonds
Stocks
Don’t know

# Make a dictionary based on the values awarded
c7_score = {
    'Don’t know':0,
    'Stocks':3,
    'Bonds': 2,
    'Savings deposit':1
}

# Temp variable for new column name
temp_col = 'C7_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C7'].replace(c7_score)
df[['C7',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/4083590857.py:15: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C7'].replace(c7_score)

df['C8'].describe()

count           1391
unique             4
top       Don’t know
freq             576
Name: C8, dtype: object

# Printing unique value in a list format
for unique_value in list(set(df['C8'])):
	print(unique_value)

Savings deposit
Bonds
Stocks
Don’t know

# Make a dictionary based on the values awarded
c8_score = {
    'Don’t know':0,
    'Stocks':3,
    'Bonds': 2,
    'Savings deposit':1
}

# Temp variable for new column name
temp_col = 'C8_processed'

# Append new column name
list_of_columns_for_financial_literacy.append(temp_col)

df[temp_col] = df['C8'].replace(c8_score)
df[['C8',temp_col]]

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/1833694699.py:15: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df[temp_col] = df['C8'].replace(c8_score)

df[list_of_columns_for_financial_literacy]

# Getting the sum of the 8 columns into a column
df['C1-C8'] = df[list_of_columns_for_financial_literacy].sum(axis=1)
df[[*list_of_columns_for_financial_literacy,'C1-C8']]

df['C1-C8'].describe()

count    1391.000000
mean       13.636233
std         5.604593
min         0.000000
25%        10.000000
50%        15.000000
75%        18.000000
max        24.000000
Name: C1-C8, dtype: float64

df['C1-C8'].hist()

<Axes: >

# Setting the Financial Behaviour labels
financial_literacy_score_labels = ['0-5', '6-10', '11-15', '16-20', '+20']

# Edge values for each bin
financial_literacy_groups = [-1,5,10,15,20,25]

# Group the values into the three groups
df['Financial Literacy'] = pd.cut(df['C1-C8'], financial_literacy_groups, labels=financial_literacy_score_labels)

df[['C1-C8','Financial Literacy']]

# Visualize final financial literacy column
df['Financial Literacy'].value_counts().loc[financial_literacy_score_labels].plot.bar()

<Axes: xlabel='Financial Literacy'>

# Get the count of each unique value
pd.crosstab(df['Financial Literacy'], 'Count')

# Get the probability by normalizing the counts of unique value
financial_literacy_probability = pd.crosstab(df['Financial Literacy'], 'Probability', normalize=True)
financial_literacy_probability

df['FWB'].describe()

count    1391.000000
mean       51.118620
std         9.963438
min        16.000000
25%        45.000000
50%        51.000000
75%        57.000000
max        91.000000
Name: FWB, dtype: float64

df['FWB'].hist()

<Axes: >

# Setting the Financial Behaviour labels
financial_well_being_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Group the values into the 5 groups
df['Financial Well Being'] = pd.cut(df['FWB'], 5, labels=financial_well_being_labels)

df[['FWB','Financial Well Being']]

# Visualize the final financial well being column
df['Financial Well Being'].value_counts().loc[financial_well_being_labels].plot.bar()

<Axes: xlabel='Financial Well Being'>

# Get the count of each unique group
pd.crosstab(df['Financial Well Being'], 'Count')

# Get the probability by normalizing the counts of unique value
financial_well_being_probability = pd.crosstab(df['Financial Well Being'], 'Probability', normalize=True)
financial_well_being_probability

model = BayesianNetwork(
    [
        ('Gender', 'Financial Literacy'),
        ('Gender', 'Financial Behaviour'),
        ('Age', 'Financial Literacy'),
        ('Age', 'Financial Behaviour'),
        ('Education', 'Financial Literacy'),
        ('Education', 'Financial Behaviour'),
        ('Financial Literacy', 'Financial Well Being'),
        ('Financial Behaviour', 'Financial Well Being')
    ]
)

gender_probability

gender_cpd = TabularCPD(
    variable='Gender',
    variable_card=2,
    values = [[0.519051],[0.480949]],
    state_names={
        'Gender':[
            'Female',
            'Male'
        ]
    }
)

model.add_cpds(gender_cpd)

print(model.get_cpds('Gender'))

+----------------+----------+
| Gender(Female) | 0.519051 |
+----------------+----------+
| Gender(Male)   | 0.480949 |
+----------------+----------+

age_probability

age_cpd = TabularCPD(
    variable='Age',
    variable_card=3,
    values=[
        [0.26312],[0.46729],[0.26959]
    ],
    state_names={
        'Age':[
            '16-36', '37-56', '76+'
        ]
    }
)

model.add_cpds(age_cpd)

print(model.get_cpds('Age'))

+------------+---------+
| Age(16-36) | 0.26312 |
+------------+---------+
| Age(37-56) | 0.46729 |
+------------+---------+
| Age(76+)   | 0.26959 |
+------------+---------+

education_probability

income_cpd = TabularCPD(
    variable='Education',
    variable_card=3,
    values=[
        [0.3932],[0.5011],[0.1057],
    ],
    state_names={
        'Education':[
            'Middle school (8 grades) and below',
            'High school (12 grades)',
            'Bachelor and above',
        ]
    }
)

model.add_cpds(income_cpd)

print(model.get_cpds('Education'))

+-----------------------------------------------+--------+
| Education(Middle school (8 grades) and below) | 0.3932 |
+-----------------------------------------------+--------+
| Education(High school (12 grades))            | 0.5011 |
+-----------------------------------------------+--------+
| Education(Bachelor and above)                 | 0.1057 |
+-----------------------------------------------+--------+

sociodemographic_columns = []
sociodemographic_columns.append('SD2')
sociodemographic_columns.append('SD3_processed')
sociodemographic_columns.append('SD4_processed')

FL_socio_probability = pd.crosstab(
    df['Financial Literacy'],
    [df['SD2'],df['SD3_processed'],df['SD4_processed']],
    normalize='columns',dropna=False,margins=True
)

#FL_socio_probability = FL_socio_probability.applymap('{:.4f}'.format)
#FL_socio_probability = FL_socio_probability.astype(np.float16)

FL_socio_probability

# Check for sum of every column 
FL_testing = FL_socio_probability[FL_socio_probability.columns].sum()

for FL in FL_testing:
    print(FL)

print('Sum: ', sum(FL_testing))

1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999999
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999999
1.0
0.9999999999999999
1.0
1.0
Sum:  19.0

distinct_value_count_for_socio_col = [len(list(set(df[x]))) for x in sociodemographic_columns]
distinct_value_count_for_socio_col

[2, 3, 3]

[x for x in FL_socio_probability.index]

['0-5', '6-10', '11-15', '16-20', '+20']

FL_socio_cpd = TabularCPD(
    variable='Financial Literacy',
    variable_card=5,
    evidence=['Gender', 'Age','Education'],
    evidence_card=distinct_value_count_for_socio_col,
    values = [
        [x for x in FL_socio_probability.loc['0-5'][:-1]],
        [x for x in FL_socio_probability.loc['6-10'][:-1]],
        [x for x in FL_socio_probability.loc['11-15'][:-1]],
        [x for x in FL_socio_probability.loc['16-20'][:-1]],
        [x for x in FL_socio_probability.loc['+20'][:-1]]
    ],
    state_names = {
        'Financial Literacy':[x for x in FL_socio_probability.index],

        'Gender':[
            'Female',
            'Male'
        ],

        'Age':[
            '16-36', '37-56', '76+'
        ],

        'Education':[
            'Middle school (8 grades) and below',
            'High school (12 grades)',
            'Bachelor and above',
        ]
    }
)

model.add_cpds(FL_socio_cpd)

print(model.get_cpds('Financial Literacy'))

+---------------------------+-----+-------------------------------+
| Gender                    | ... | Gender(Male)                  |
+---------------------------+-----+-------------------------------+
| Age                       | ... | Age(76+)                      |
+---------------------------+-----+-------------------------------+
| Education                 | ... | Education(Bachelor and above) |
+---------------------------+-----+-------------------------------+
| Financial Literacy(0-5)   | ... | 0.5                           |
+---------------------------+-----+-------------------------------+
| Financial Literacy(6-10)  | ... | 0.325                         |
+---------------------------+-----+-------------------------------+
| Financial Literacy(11-15) | ... | 0.15                          |
+---------------------------+-----+-------------------------------+
| Financial Literacy(16-20) | ... | 0.025                         |
+---------------------------+-----+-------------------------------+
| Financial Literacy(+20)   | ... | 0.0                           |
+---------------------------+-----+-------------------------------+

df['Financial Behaviour']

0        4+
1       2-4
2        4+
3       0-2
4       2-4
       ... 
1386     4+
1387    2-4
1388    2-4
1389    0-2
1390    0-2
Name: Financial Behaviour, Length: 1391, dtype: category
Categories (3, object): ['0-2' < '2-4' < '4+']

FB_socio_probability = pd.crosstab(
    df['Financial Behaviour'],
    [df['SD2'],df['SD3_processed'],df['SD4_processed']],
    normalize='columns',dropna=False,margins=True
)
#FB_socio_probability = FB_socio_probability.applymap('{:.4f}'.format)
#FB_socio_probability = FB_socio_probability.astype(np.float16)
FB_socio_probability

# Check for sum of every column 
FB_Testing = FB_socio_probability[FB_socio_probability.columns].sum()

for FB in FB_Testing:
    print(FB)

print('Sum: ', sum(FB_Testing))

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.9999999999999999
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
Sum:  19.0

FB_socio_cpd = TabularCPD(
    variable='Financial Behaviour',
    variable_card=3,
    evidence=['Gender', 'Age','Education'],
    evidence_card=distinct_value_count_for_socio_col,
    values = [
        [x for x in FB_socio_probability.loc['0-2'][:-1]],
        [x for x in FB_socio_probability.loc['2-4'][:-1]],
        [x for x in FB_socio_probability.loc['4+'][:-1]],
    ],
    state_names = {
        'Financial Behaviour':[x for x in FB_socio_probability.index],

        'Gender':[
            'Female',
            'Male'
        ],

        'Age':[
            '16-36', '37-56', '76+'
        ],

        'Education':[
            'Middle school (8 grades) and below',
            'High school (12 grades)',
            'Bachelor and above',
        ]
    }
)

model.add_cpds(FB_socio_cpd)

print(model.get_cpds('Financial Behaviour'))

+--------------------------+-----+-------------------------------+
| Gender                   | ... | Gender(Male)                  |
+--------------------------+-----+-------------------------------+
| Age                      | ... | Age(76+)                      |
+--------------------------+-----+-------------------------------+
| Education                | ... | Education(Bachelor and above) |
+--------------------------+-----+-------------------------------+
| Financial Behaviour(0-2) | ... | 0.925                         |
+--------------------------+-----+-------------------------------+
| Financial Behaviour(2-4) | ... | 0.075                         |
+--------------------------+-----+-------------------------------+
| Financial Behaviour(4+)  | ... | 0.0                           |
+--------------------------+-----+-------------------------------+

FWB_probability = pd.crosstab(
    df['Financial Well Being'],
    [df['Financial Behaviour'], df['Financial Literacy']],
    normalize='columns', dropna=False, margins=True
)

FWB_probability

# Check for sum of every column 
FWB_Testing = FWB_probability[FWB_probability.columns].sum()

for FWB in FWB_Testing:
    print(FWB)

print('Sum: ', sum(FWB_Testing))

0.9999999999999999
1.0
0.9999999999999999
1.0
1.0
1.0
1.0
0.9999999999999999
1.0
1.0
0.0
1.0
1.0
0.9999999999999999
1.0
1.0
Sum:  15.0

# Visualize the final financial well being column
df['Financial Well Being'].value_counts().loc[financial_well_being_labels].plot.bar()

<Axes: xlabel='Financial Well Being'>

FWB_group_changes = {
    'Very Low': 'Low and Below',
    'Low': 'Low and Below', 
    'High': 'High and Above', 
    'Very High': 'High and Above'
}

df['Financial Well Being_processed'] = df['Financial Well Being'].replace(FWB_group_changes)
df['Financial Well Being_processed']

/var/folders/4d/3zg2grqx5kj9w9kfm1mqfcvw0000gn/T/ipykernel_26276/1069431917.py:8: FutureWarning: The behavior of Series.replace (and DataFrame.replace) with CategoricalDtype is deprecated. In a future version, replace will only be used for cases that preserve the categories. To change the categories, use ser.cat.rename_categories instead.
  df['Financial Well Being_processed'] = df['Financial Well Being'].replace(FWB_group_changes)

0       High and Above
1       High and Above
2       High and Above
3               Medium
4               Medium
             ...      
1386    High and Above
1387            Medium
1388            Medium
1389            Medium
1390            Medium
Name: Financial Well Being_processed, Length: 1391, dtype: category
Categories (3, object): ['Low and Below' < 'Medium' < 'High and Above']

# Get the count of each unique value
pd.crosstab(df['Financial Well Being_processed'], 'Count')

# Financial Well Being 
fwb_processed_order = [
    'Low and Below', 
    'Medium', 
    'High and Above'
]

# Visualize the final financial well being column
df['Financial Well Being_processed'].value_counts().loc[fwb_processed_order].plot.bar()

<Axes: xlabel='Financial Well Being_processed'>

# Get the probability by normalizing the counts of unique value
fwb_processed_probability = pd.crosstab(df['Financial Well Being_processed'], 'Probability', normalize=True)
fwb_processed_probability

FWB_probability = pd.crosstab(
    df['Financial Well Being_processed'],
    [df['Financial Behaviour'], df['Financial Literacy']],
    normalize='columns', dropna=False, margins=True
)

FWB_probability

# Check for sum of every column 
FWB_Testing = FWB_probability[FWB_probability.columns].sum()

for FWB in FWB_Testing:
    print(FWB)

print('Sum: ', sum(FWB_Testing))

0.9999999999999999
1.0
0.9999999999999999
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
Sum:  15.0

# Setting the Financial Behaviour labels
financial_literacy_score_labels = ['Below Average','Above Average']

# Edge values for each bin
financial_literacy_groups = [-1,14,25]

# Group the values into the three groups
df['Financial Literacy_processed'] = pd.cut(df['C1-C8'], financial_literacy_groups, labels=financial_literacy_score_labels)
df['Financial Literacy_processed'].value_counts().loc[financial_literacy_score_labels].plot.bar()

<Axes: xlabel='Financial Literacy_processed'>

# Get the count of each unique value
pd.crosstab(df['Financial Literacy_processed'], 'Count')

FL_socio_probability = pd.crosstab(
    df['Financial Literacy_processed'],
    [df['SD2'],df['SD3_processed'],df['SD4_processed']],
    normalize='columns',dropna=False,margins=True
)

FL_socio_probability

FL_socio_cpd = TabularCPD(
    variable='Financial Literacy',
    variable_card=2,
    evidence=['Gender', 'Age','Education'],
    evidence_card=distinct_value_count_for_socio_col,
    values = [
        [x for x in FL_socio_probability.loc['Below Average'][:-1]],
        [x for x in FL_socio_probability.loc['Above Average'][:-1]]
    ],

    state_names={
        'Financial Literacy':[x for x in FL_socio_probability.index],

        'Age':[
            '16-36', '37-56', '76+'
        ],

        'Gender':[
            'Female',
            'Male'
        ],

        'Education':[
            'Middle school (8 grades) and below',
            'High school (12 grades)',
            'Bachelor and above',
        ]
    }
)

model.add_cpds(FL_socio_cpd)
print(model.get_cpds('Financial Literacy'))

WARNING:pgmpy:Replacing existing CPD for Financial Literacy

+-----------------------------------+-----+-------------------------------+
| Gender                            | ... | Gender(Male)                  |
+-----------------------------------+-----+-------------------------------+
| Age                               | ... | Age(76+)                      |
+-----------------------------------+-----+-------------------------------+
| Education                         | ... | Education(Bachelor and above) |
+-----------------------------------+-----+-------------------------------+
| Financial Literacy(Below Average) | ... | 0.975                         |
+-----------------------------------+-----+-------------------------------+
| Financial Literacy(Above Average) | ... | 0.025                         |
+-----------------------------------+-----+-------------------------------+

FWB_probability = pd.crosstab(
    df['Financial Well Being_processed'],
    [df['Financial Behaviour'], df['Financial Literacy_processed']],
    normalize='columns', dropna=False, margins=True
)

FWB_probability

FWB_socio_cpd = TabularCPD(
    variable='Financial Well Being', 
    variable_card=3,
    evidence=['Financial Behaviour', 'Financial Literacy'],
    evidence_card=[3, 2],
    values = [
        [x for x in FWB_probability.loc['Low and Below'][:-1]],
        [x for x in FWB_probability.loc['Medium'][:-1]],
        [x for x in FWB_probability.loc['High and Above'][:-1]]

    ], 

    state_names = {
        'Financial Literacy': [x for x in FL_socio_probability.index],
        'Financial Behaviour': [x for x in FB_socio_probability.index],
        'Financial Well Being': [x for x in FWB_probability.index]
    }
)

model.add_cpds(FWB_socio_cpd)

infer = VariableElimination(model)
print(infer.query(variables = ['Financial Literacy']))

+-----------------------------------+---------------------------+
| Financial Literacy                |   phi(Financial Literacy) |
+===================================+===========================+
| Financial Literacy(Below Average) |                    0.4850 |
+-----------------------------------+---------------------------+
| Financial Literacy(Above Average) |                    0.5150 |
+-----------------------------------+---------------------------+

print(infer.query(variables = ['Financial Well Being']))

+--------------------------------------+-----------------------------+
| Financial Well Being                 |   phi(Financial Well Being) |
+======================================+=============================+
| Financial Well Being(Low and Below)  |                      0.3055 |
+--------------------------------------+-----------------------------+
| Financial Well Being(Medium)         |                      0.5587 |
+--------------------------------------+-----------------------------+
| Financial Well Being(High and Above) |                      0.1358 |
+--------------------------------------+-----------------------------+

print(infer.query(
    variables = ['Financial Well Being'],
    evidence = {'Financial Literacy':'Above Average'}
))

+--------------------------------------+-----------------------------+
| Financial Well Being                 |   phi(Financial Well Being) |
+======================================+=============================+
| Financial Well Being(Low and Below)  |                      0.2254 |
+--------------------------------------+-----------------------------+
| Financial Well Being(Medium)         |                      0.5894 |
+--------------------------------------+-----------------------------+
| Financial Well Being(High and Above) |                      0.1852 |
+--------------------------------------+-----------------------------+

print(infer.query(
    variables = ['Financial Well Being'],
    evidence = {'Financial Literacy':'Below Average'}
    )
)

+--------------------------------------+-----------------------------+
| Financial Well Being                 |   phi(Financial Well Being) |
+======================================+=============================+
| Financial Well Being(Low and Below)  |                      0.3906 |
+--------------------------------------+-----------------------------+
| Financial Well Being(Medium)         |                      0.5260 |
+--------------------------------------+-----------------------------+
| Financial Well Being(High and Above) |                      0.0833 |
+--------------------------------------+-----------------------------+

print(infer.query(
    variables = ['Financial Well Being'],
    evidence = {'Age':'76+', 'Gender':'Female'}
    )
)

+--------------------------------------+-----------------------------+
| Financial Well Being                 |   phi(Financial Well Being) |
+======================================+=============================+
| Financial Well Being(Low and Below)  |                      0.3304 |
+--------------------------------------+-----------------------------+
| Financial Well Being(Medium)         |                      0.5494 |
+--------------------------------------+-----------------------------+
| Financial Well Being(High and Above) |                      0.1202 |
+--------------------------------------+-----------------------------+

print(infer.query(
    variables = ['Financial Well Being'],
    evidence = {'Age':'16-36', 'Age':'37-56', 'Gender':'Female'}
    )
)

+--------------------------------------+-----------------------------+
| Financial Well Being                 |   phi(Financial Well Being) |
+======================================+=============================+
| Financial Well Being(Low and Below)  |                      0.2887 |
+--------------------------------------+-----------------------------+
| Financial Well Being(Medium)         |                      0.5648 |
+--------------------------------------+-----------------------------+
| Financial Well Being(High and Above) |                      0.1465 |
+--------------------------------------+-----------------------------+

print(infer.query(
    variables = ['Age', 'Gender'],
    evidence = {'Financial Well Being':'Low and Below'}
    )
)

+------------+----------------+-------------------+
| Age        | Gender         |   phi(Age,Gender) |
+============+================+===================+
| Age(16-36) | Gender(Female) |            0.1302 |
+------------+----------------+-------------------+
| Age(16-36) | Gender(Male)   |            0.1214 |
+------------+----------------+-------------------+
| Age(37-56) | Gender(Female) |            0.2292 |
+------------+----------------+-------------------+
| Age(37-56) | Gender(Male)   |            0.2331 |
+------------+----------------+-------------------+
| Age(76+)   | Gender(Female) |            0.1513 |
+------------+----------------+-------------------+
| Age(76+)   | Gender(Male)   |            0.1349 |
+------------+----------------+-------------------+

print(infer.query(
    variables = ['Age', 'Gender'],
    evidence = {'Financial Well Being':'Low and Below', 'Financial Literacy':'Below Average'}
    )
)

+------------+----------------+-------------------+
| Age        | Gender         |   phi(Age,Gender) |
+============+================+===================+
| Age(16-36) | Gender(Female) |            0.1148 |
+------------+----------------+-------------------+
| Age(16-36) | Gender(Male)   |            0.1024 |
+------------+----------------+-------------------+
| Age(37-56) | Gender(Female) |            0.2246 |
+------------+----------------+-------------------+
| Age(37-56) | Gender(Male)   |            0.2289 |
+------------+----------------+-------------------+
| Age(76+)   | Gender(Female) |            0.1796 |
+------------+----------------+-------------------+
| Age(76+)   | Gender(Male)   |            0.1498 |
+------------+----------------+-------------------+

print(infer.query(
    variables = ['Financial Literacy'],
    evidence = {'Age':'76+', 'Gender':'Female'}
    )
)

+-----------------------------------+---------------------------+
| Financial Literacy                |   phi(Financial Literacy) |
+===================================+===========================+
| Financial Literacy(Below Average) |                    0.6193 |
+-----------------------------------+---------------------------+
| Financial Literacy(Above Average) |                    0.3807 |
+-----------------------------------+---------------------------+

print(infer.query(
    variables = ['Financial Literacy'],
    evidence = {'Age':'16-36', 'Age':'37-56', 'Gender':'Female'}
    )
)

+-----------------------------------+---------------------------+
| Financial Literacy                |   phi(Financial Literacy) |
+===================================+===========================+
| Financial Literacy(Below Average) |                    0.4676 |
+-----------------------------------+---------------------------+
| Financial Literacy(Above Average) |                    0.5324 |
+-----------------------------------+---------------------------+

print(infer.query(
    variables = ['Financial Well Being'],
    evidence = {'Age':'76+', 'Gender':'Female', 'Financial Literacy':'Below Average'}
    )
)

+--------------------------------------+-----------------------------+
| Financial Well Being                 |   phi(Financial Well Being) |
+======================================+=============================+
| Financial Well Being(Low and Below)  |                      0.3925 |
+--------------------------------------+-----------------------------+
| Financial Well Being(Medium)         |                      0.5258 |
+--------------------------------------+-----------------------------+
| Financial Well Being(High and Above) |                      0.0817 |
+--------------------------------------+-----------------------------+

model.check_model()

True

SD2	Female									Male									All
SD3_processed	16-36			37-56			57+			16-36			37-56			57+
SD4_processed	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below
Financial Literacy
0-5	0.020690	0.097222	0.250	0.058065	0.052632	0.615385	0.043478	0.164706	0.46	0.013514	0.052632	0.3	0.030303	0.069892	0.346154	0.058824	0.103175	0.500	0.103523
6-10	0.055172	0.083333	0.250	0.064516	0.152047	0.153846	0.086957	0.270588	0.26	0.040541	0.175439	0.5	0.121212	0.193548	0.230769	0.098039	0.285714	0.325	0.156722
11-15	0.337931	0.319444	0.125	0.277419	0.362573	0.230769	0.260870	0.400000	0.26	0.243243	0.228070	0.1	0.313131	0.301075	0.346154	0.215686	0.333333	0.150	0.302660
16-20	0.441379	0.472222	0.375	0.496774	0.415205	0.000000	0.565217	0.141176	0.02	0.432432	0.473684	0.1	0.404040	0.370968	0.038462	0.372549	0.238095	0.025	0.355859
+20	0.144828	0.027778	0.000	0.103226	0.017544	0.000000	0.043478	0.023529	0.00	0.270270	0.070175	0.0	0.131313	0.064516	0.038462	0.254902	0.039683	0.000	0.081237

Financial Behaviour	0-2					2-4					4+					All
Financial Literacy	0-5	6-10	11-15	16-20	+20	0-5	6-10	11-15	16-20	+20	0-5	6-10	11-15	16-20	+20
Financial Well Being
Very Low	0.085271	0.062857	0.019672	0.013115	0.000000	0.000000	0.023810	0.000000	0.006803	0.000000	0.0	0.0	0.0	0.000000	0.000000	0.024443
Low	0.457364	0.400000	0.370492	0.301639	0.137255	0.200000	0.261905	0.166667	0.122449	0.000000	0.0	0.0	0.1	0.046512	0.000000	0.282531
Medium	0.418605	0.462857	0.544262	0.570492	0.666667	0.666667	0.523810	0.687500	0.585034	0.595745	0.0	1.0	0.7	0.627907	0.533333	0.554277
High	0.038760	0.062857	0.055738	0.108197	0.196078	0.133333	0.190476	0.135417	0.278912	0.319149	0.0	0.0	0.2	0.279070	0.466667	0.127965
Very High	0.000000	0.011429	0.009836	0.006557	0.000000	0.000000	0.000000	0.010417	0.006803	0.085106	0.0	0.0	0.0	0.046512	0.000000	0.010784

Financial Behaviour	0-2					2-4					4+					All
Financial Literacy	0-5	6-10	11-15	16-20	+20	0-5	6-10	11-15	16-20	+20	0-5	6-10	11-15	16-20	+20
Financial Well Being_processed
Low and Below	0.542636	0.462857	0.390164	0.314754	0.137255	0.200000	0.285714	0.166667	0.129252	0.000000	0.0	0.0	0.1	0.046512	0.000000	0.306973
Medium	0.418605	0.462857	0.544262	0.570492	0.666667	0.666667	0.523810	0.687500	0.585034	0.595745	0.0	1.0	0.7	0.627907	0.533333	0.554277
High and Above	0.038760	0.074286	0.065574	0.114754	0.196078	0.133333	0.190476	0.145833	0.285714	0.404255	0.0	0.0	0.2	0.325581	0.466667	0.138749

SD2	Female									Male									All
SD3_processed	16-36			37-56			57+			16-36			37-56			57+
SD4_processed	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below	Bachelor and above	High school (12 grades)	Middle school (8 grades) and below
Financial Literacy_processed
Below Average	0.344828	0.430556	0.625	0.322581	0.48538	0.923077	0.347826	0.764706	0.94	0.243243	0.403509	0.8	0.363636	0.494624	0.846154	0.333333	0.634921	0.975	0.49317
Above Average	0.655172	0.569444	0.375	0.677419	0.51462	0.076923	0.652174	0.235294	0.06	0.756757	0.596491	0.2	0.636364	0.505376	0.153846	0.666667	0.365079	0.025	0.50683

Financial Behaviour	0-2		2-4		4+		All
Financial Literacy_processed	Below Average	Above Average	Below Average	Above Average	Below Average	Above Average
Financial Well Being_processed
Low and Below	0.459410	0.293144	0.201550	0.110092	0.133333	0.03125	0.306973
Medium	0.479705	0.588652	0.658915	0.582569	0.666667	0.62500	0.554277
High and Above	0.060886	0.118203	0.139535	0.307339	0.200000	0.34375	0.138749

Importing Libraries¶

Data¶

Loading the Data¶

Looking into the Columns¶

Sociodemographic Columns¶

Gender¶

Age¶

Educational Attainment¶

Financial Behaviour and Attitude¶

Record Keeping¶

Money Invested¶

Aggregate into Financial Behaviour Column¶

Financial Literacy¶

Question 1¶

Question 2¶

Question 3¶

Question 4¶

Question 5¶

Question 6¶

Question 7¶

Question 8¶

Get the sum of all the columns and then group them into categories¶

Financial Well Being¶

Defining the Network¶

Defining the CPD to the Bayesian Network¶

Gender¶

Age¶

Education¶

Financial Literacy¶

Financial Behaviour¶

Financial Well Being¶

Inferences and Queries¶

	id	NUTS3	NUTS2	SD1	SD1a	SD2	SD3	age	age2	SD4	...	C3	C4	C5	C6	C7	C8	weight	age16_61	FWB
0	2989	IF	Bucuresti-Ilfov	Urban area, with fewer than 30,000 people	Urban	Female	49	40-59 years	45-54 years	High school (12 grades)	...	False	They are equally rich	The same	Multiple business of investments	Bonds	Stocks	0.054678	1	67
1	3346	IF	Bucuresti-Ilfov	Urban area, with fewer than 30,000 people	Urban	Female	32	16-39 years	25-34 years	Bachelor and master education	...	False	They are equally rich	The same	Multiple business of investments	Bonds	Stocks	0.054678	1	75
2	2574	IF	Bucuresti-Ilfov	Urban area, with fewer than 30,000 people	Urban	Female	41	40-59 years	35-44 years	Bachelor and master education	...	False	They are equally rich	The same	Multiple business of investments	Bonds	Stocks	0.054678	1	68
3	3337	IF	Bucuresti-Ilfov	Urban area, with fewer than 30,000 people	Urban	Female	44	40-59 years	35-44 years	High school (12 grades)	...	Don’t know	They are equally rich	The same	Multiple business of investments	Bonds	Stocks	0.054678	1	48
4	3332	IF	Bucuresti-Ilfov	Urban area, with fewer than 30,000 people	Urban	Female	75	60+ years	65+ years	Middle school (8 grades)	...	True	Don’t know	The same	Don’t know	Savings deposit	Don’t know	0.054678	0	53


		Yes	No
I3_1	Mass-media (TV and radio)	1	2
I3_2	Online and printed newspapers	1	2
I3_3	Financial websites and mobile apps	1	2
I3_4	Advice from friends	1	2
I3_5	Personal experience and knowledge	1	2
I3_6	Other sources	1	2

col_0	Probability
SD4_processed
Bachelor and above	0.393242
High school (12 grades)	0.501078
Middle school (8 grades) and below	0.105679

	I1	I1_processed
0	Yes, we keep records, but not all revenues and...	2
1	No, we don’t keep records, but we know how muc...	1
2	Yes, we keep records, but not all revenues and...	2
3	Yes, we keep records, but not all revenues and...	2
4	No, we don’t keep records, but we know how muc...	1
...	...	...
1386	Yes, we keep records, but not all revenues and...	2
1387	No, we don’t keep records, but we know how muc...	1
1388	No, we don’t keep records, but we know how muc...	1
1389	Yes, we keep records of all revenues and all e...	3
1390	No, we don’t keep records, but we know how muc...	1


I2_0	I have not saved or invested
I2_1	Savings deposit
I2_2	Stocks
I2_3	Bonds
I2_4	Real estate
I2_5	Investment funds
I2_6	Life insurance
I2_7	Cryptocurrency
I2_8	I saved and kept money at home

	C1	C1_processed
0	1 in 1,000	2
1	1 in 1,000	2
2	1 in 1,000	2
3	1 in 1,000	2
4	1 in 1,000	2
...	...	...
1386	1 in 1,000	2
1387	1 in 1,000	2
1388	Don’t know	0
1389	Don’t know	0
1390	Don’t know	0


More than LEI 150	1
Exactly LEI 150 lei	2
Less than LEI 150 lei	3
Don’t know	0

	C2	C2_processed
0	Exactly LEI 150	2
1	Exactly LEI 150	2
2	Exactly LEI 150	2
3	Exactly LEI 150	2
4	Exactly LEI 150	2
...	...	...
1386	Exactly LEI 150	2
1387	Exactly LEI 150	2
1388	More than LEI 150	3
1389	Exactly LEI 150	2
1390	Don’t know	0


One business or investment	1
Multiple business of investments	2
Don’t know	0