import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
%matplotlib inline
# Load the raw appointments file; `data` keeps the untouched original while
# all cleaning and exploration below happens on the copy `df`.
data = pd.read_csv('noshows.csv')
df = data.copy()
# Preview the first rows.
df.head()
| PatientId | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hipertension | Diabetes | Alcoholism | Handcap | SMS_received | No-show | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.987250e+13 | 5642903 | F | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | No |
| 1 | 5.589978e+14 | 5642503 | M | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | No |
| 2 | 4.262962e+12 | 5642549 | F | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | No |
| 3 | 8.679512e+11 | 5642828 | F | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | No |
| 4 | 8.841186e+12 | 5642494 | F | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | No |
df.shape
(110527, 14)
df.duplicated().sum()
0
# Count the missing values in every column, largest count first:
df.isnull().sum().sort_values(ascending=False).to_frame()
| 0 | |
|---|---|
| PatientId | 0 |
| AppointmentID | 0 |
| Gender | 0 |
| ScheduledDay | 0 |
| AppointmentDay | 0 |
| Age | 0 |
| Neighbourhood | 0 |
| Scholarship | 0 |
| Hipertension | 0 |
| Diabetes | 0 |
| Alcoholism | 0 |
| Handcap | 0 |
| SMS_received | 0 |
| No-show | 0 |
# Columns overview & explore data types :
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 110527 entries, 0 to 110526 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PatientId 110527 non-null float64 1 AppointmentID 110527 non-null int64 2 Gender 110527 non-null object 3 ScheduledDay 110527 non-null object 4 AppointmentDay 110527 non-null object 5 Age 110527 non-null int64 6 Neighbourhood 110527 non-null object 7 Scholarship 110527 non-null int64 8 Hipertension 110527 non-null int64 9 Diabetes 110527 non-null int64 10 Alcoholism 110527 non-null int64 11 Handcap 110527 non-null int64 12 SMS_received 110527 non-null int64 13 No-show 110527 non-null object dtypes: float64(1), int64(8), object(5) memory usage: 11.8+ MB
# Compare how many distinct patients there are with how many distinct appointments:
print(df['PatientId'].nunique(dropna=False), '..', df['AppointmentID'].nunique(dropna=False))
62299 .. 110527
pd.DataFrame(df.loc[:, 'No-show'])
| No-show | |
|---|---|
| 0 | No |
| 1 | No |
| 2 | No |
| 3 | No |
| 4 | No |
| ... | ... |
| 110522 | No |
| 110523 | No |
| 110524 | No |
| 110525 | No |
| 110526 | No |
110527 rows × 1 columns
# Rename the target so that its meaning is positive ("attended") instead of
# negative ("no-show"), then encode it numerically.
df.rename(columns={'No-show': 'Attended_appointment'}, inplace=True)
# 'No' (no no-show) -> 1 = attended, 'Yes' (no-show) -> 0 = missed.
# A single dict-based replace handles both labels in one pass and leaves
# already-encoded values untouched, so re-running the cell is harmless.
# The explicit cast guarantees an integer column instead of relying on
# pandas' implicit object -> int downcasting.
df['Attended_appointment'] = (
    df['Attended_appointment'].replace({'No': 1, 'Yes': 0}).astype('int64')
)
pd.DataFrame(df.loc[:, 'Attended_appointment'])
| Attended_appointment | |
|---|---|
| 0 | 1 |
| 1 | 1 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
| ... | ... |
| 110522 | 1 |
| 110523 | 1 |
| 110524 | 1 |
| 110525 | 1 |
| 110526 | 1 |
110527 rows × 1 columns
# Parse both timestamp columns into datetimes, then confirm the resulting dtypes.
for ts_col in ('ScheduledDay', 'AppointmentDay'):
    df[ts_col] = pd.to_datetime(df[ts_col])
for ts_col in ('ScheduledDay', 'AppointmentDay'):
    print(df[ts_col].dtype)
datetime64[ns, UTC] datetime64[ns, UTC]
# Waiting time in whole calendar days between booking and appointment.
# normalize() truncates each timestamp to midnight while keeping the
# datetime64 dtype, so the subtraction stays vectorized and yields a proper
# timedelta column (using .dt.date would fall back to Python date objects).
df['waiting_period'] = (
    df['AppointmentDay'].dt.normalize() - df['ScheduledDay'].dt.normalize()
).dt.days
df.loc[:, ['waiting_period']]
| waiting_period | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 0 |
| ... | ... |
| 110522 | 35 |
| 110523 | 35 |
| 110524 | 41 |
| 110525 | 41 |
| 110526 | 41 |
110527 rows × 1 columns
df['waiting_period'].unique()
array([ 0, 2, 3, 1, 4, 9, 29, 10, 23, 11, 18, 17, 14,
28, 24, 21, 15, 16, 22, 43, 30, 31, 42, 32, 56, 45,
46, 39, 37, 38, 44, 50, 60, 52, 53, 65, 67, 91, 66,
84, 78, 87, 115, 109, 63, 70, 72, 57, 58, 51, 59, 41,
49, 73, 64, 20, 33, 34, 6, 35, 36, 12, 13, 40, 47,
8, 5, 7, 25, 26, 48, 27, 19, 61, 55, 62, 176, 54,
77, 69, 83, 76, 89, 81, 103, 79, 68, 75, 85, 112, -1,
80, 86, 98, 94, 142, 155, 162, 169, 104, 133, 125, 96, 88,
90, 151, 126, 127, 111, 119, 74, 71, 82, 108, 110, 102, 122,
101, 105, 92, 97, 93, 107, 95, -6, 139, 132, 179, 117, 146,
123], dtype=int64)
df[df['waiting_period'] < 0 ].shape[0]
5
# Remove appointments whose waiting time is negative (booked after the
# appointment date), then confirm that none remain.
df.drop(index=df.index[df['waiting_period'] < 0], inplace=True)
df[df['waiting_period'] < 0]
| PatientId | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hipertension | Diabetes | Alcoholism | Handcap | SMS_received | Attended_appointment | waiting_period |
|---|
# Renumber the rows after rows were dropped, so the index is contiguous again
# (avoids surprises in operations that walk the index, e.g. a for loop).
df.reset_index(drop=True, inplace=True)

# Split each timestamp into month / weekday name / hour features.
# The columns are created in the same order as listed here.
for source_col, (month_col, day_col, hour_col) in {
    'ScheduledDay': ('Scheduled_month', 'ScheduledD_day', 'Scheduled_hour'),
    'AppointmentDay': ('Appointment_month', 'Appointment_day', 'Appointment_hour'),
}.items():
    stamp = df[source_col].dt
    df[month_col] = stamp.month
    df[day_col] = stamp.day_name()
    df[hour_col] = stamp.hour

# The raw timestamps are no longer needed once the features exist.
df.drop(columns=['ScheduledDay', 'AppointmentDay'], inplace=True)

# Show the distinct values of every derived date column:
date_column = [
    'Scheduled_month', 'ScheduledD_day', 'Scheduled_hour',
    'Appointment_month', 'Appointment_day', 'Appointment_hour',
]
for clm in date_column:
    print(f' values in ({clm}) : {df[clm].unique()} \n')
values in (Scheduled_month) : [ 4 3 2 1 5 11 12 6] values in (ScheduledD_day) : ['Friday' 'Wednesday' 'Tuesday' 'Thursday' 'Monday' 'Saturday'] values in (Scheduled_hour) : [18 16 17 8 15 12 14 11 10 9 7 13 19 20 6 21] values in (Appointment_month) : [4 5 6] values in (Appointment_day) : ['Friday' 'Tuesday' 'Monday' 'Wednesday' 'Thursday' 'Saturday'] values in (Appointment_hour) : [0]
# Appointment_hour contains only zero, so it carries no information and is dropped:
df.drop(['Appointment_hour'], axis=1, inplace=True)
# overview :
df.hist(figsize=(10,8));
df['Age'].hist();
# Pass the series by keyword: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
sns.boxplot(x=df['Age']);
# Look for impossible (negative) ages:
df[(df['Age']<0)]
| PatientId | AppointmentID | Gender | Age | Neighbourhood | Scholarship | Hipertension | Diabetes | Alcoholism | Handcap | SMS_received | Attended_appointment | waiting_period | Scheduled_month | ScheduledD_day | Scheduled_hour | Appointment_month | Appointment_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99827 | 4.659432e+14 | 5775010 | F | -1 | ROMÃO | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 6 | Monday | 8 | 6 | Monday |
# Drop every row with a negative (impossible) age.
# Select the rows by condition rather than by a hard-coded index label, so
# the invalid record is the one removed regardless of where it sits.
df.drop(index=df.index[df['Age'] < 0], inplace=True)
# check (should now be empty):
df[df['Age']<0]
| PatientId | AppointmentID | Gender | Age | Neighbourhood | Scholarship | Hipertension | Diabetes | Alcoholism | Handcap | SMS_received | Attended_appointment | waiting_period | Scheduled_month | ScheduledD_day | Scheduled_hour | Appointment_month | Appointment_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99827 | 4.659432e+14 | 5775010 | F | -1 | ROMÃO | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 6 | Monday | 8 | 6 | Monday |
df.iloc[99831:99833]
| PatientId | AppointmentID | Gender | Age | Neighbourhood | Scholarship | Hipertension | Diabetes | Alcoholism | Handcap | SMS_received | Attended_appointment | waiting_period | Scheduled_month | ScheduledD_day | Scheduled_hour | Appointment_month | Appointment_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99831 | 2.664689e+10 | 5768132 | M | 3 | ROMÃO | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 6 | Friday | 7 | 6 | Monday |
| 99833 | 2.963993e+13 | 5768135 | F | 4 | FORTE SÃO JOÃO | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 6 | Friday | 7 | 6 | Monday |
# so we should reset the index after dropping rows, to avoid errors in operations that rely on a contiguous index (e.g. a for loop):
df.reset_index(drop=True,inplace=True)
df.iloc[99831:99833]
| PatientId | AppointmentID | Gender | Age | Neighbourhood | Scholarship | Hipertension | Diabetes | Alcoholism | Handcap | SMS_received | Attended_appointment | waiting_period | Scheduled_month | ScheduledD_day | Scheduled_hour | Appointment_month | Appointment_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 99831 | 2.664689e+10 | 5768132 | M | 3 | ROMÃO | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 6 | Friday | 7 | 6 | Monday |
| 99832 | 2.963993e+13 | 5768135 | F | 4 | FORTE SÃO JOÃO | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 6 | Friday | 7 | 6 | Monday |
# def load_values_COUNT():
# while True :
# selected_column = input(f'choose : {list(df.columns)}')
# while selected_column not in list(df.columns) and selected_column != 'done' :
# selected_column = input('please, enter correct answer!')
# if selected_column == 'done' :
# break
# else :
# print('-'*50)
# print(f'{selected_column} : \n')
# print((df[selected_column].value_counts()))
# print(f'sum = {(df[selected_column].value_counts().sum())} , rows = {len(df.index)}')
# print(f'unique = {df[selected_column].unique()}')
# print('-'*50,'\n')
def load_values_COUNT(selected_column, data=None):
    """Print a frequency summary for one column.

    The report contains, between two separator lines: the column name, the
    per-value counts, the sum of those counts next to the total number of
    rows (the two differ when the column has missing values, which
    ``value_counts`` leaves out), and the distinct values.

    Parameters
    ----------
    selected_column : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the module-level ``df``,
        which keeps existing one-argument calls working unchanged.
    """
    frame = df if data is None else data
    column = frame[selected_column]
    # Compute the counts once and reuse them for both the table and the sum.
    counts = column.value_counts()
    separator = '-' * 50
    print(separator)
    print(f'{selected_column} : \n')
    print(counts)
    print(f'sum = {counts.sum()} , rows = {len(frame.index)}')
    print(f'unique = {column.unique()}')
    print(separator)
load_values_COUNT('Gender')
-------------------------------------------------- Gender : F 71837 M 38684 Name: Gender, dtype: int64 sum = 110521 , rows = 110521 unique = ['F' 'M'] --------------------------------------------------
load_values_COUNT('Scholarship')
-------------------------------------------------- Scholarship : 0 99660 1 10861 Name: Scholarship, dtype: int64 sum = 110521 , rows = 110521 unique = [0 1] --------------------------------------------------
load_values_COUNT('Hipertension')
-------------------------------------------------- Hipertension : 0 88720 1 21801 Name: Hipertension, dtype: int64 sum = 110521 , rows = 110521 unique = [1 0] --------------------------------------------------
load_values_COUNT('Diabetes')
-------------------------------------------------- Diabetes : 0 102578 1 7943 Name: Diabetes, dtype: int64 sum = 110521 , rows = 110521 unique = [0 1] --------------------------------------------------
load_values_COUNT('Alcoholism')
-------------------------------------------------- Alcoholism : 0 107161 1 3360 Name: Alcoholism, dtype: int64 sum = 110521 , rows = 110521 unique = [0 1] --------------------------------------------------
load_values_COUNT('Handcap')
-------------------------------------------------- Handcap : 0 108282 1 2040 2 183 3 13 4 3 Name: Handcap, dtype: int64 sum = 110521 , rows = 110521 unique = [0 1 2 3 4] --------------------------------------------------
load_values_COUNT('SMS_received')
-------------------------------------------------- SMS_received : 0 75040 1 35481 Name: SMS_received, dtype: int64 sum = 110521 , rows = 110521 unique = [0 1] --------------------------------------------------
load_values_COUNT('Attended_appointment')
-------------------------------------------------- Attended_appointment : 1 88208 0 22313 Name: Attended_appointment, dtype: int64 sum = 110521 , rows = 110521 unique = [1 0] --------------------------------------------------
load_values_COUNT('waiting_period')
--------------------------------------------------
waiting_period :
0 38563
2 6725
4 5290
1 5213
7 4906
...
82 1
146 1
123 1
101 1
127 1
Name: waiting_period, Length: 129, dtype: int64
sum = 110521 , rows = 110521
unique = [ 0 2 3 1 4 9 29 10 23 11 18 17 14 28 24 21 15 16
22 43 30 31 42 32 56 45 46 39 37 38 44 50 60 52 53 65
67 91 66 84 78 87 115 109 63 70 72 57 58 51 59 41 49 73
64 20 33 34 6 35 36 12 13 40 47 8 5 7 25 26 48 27
19 61 55 62 176 54 77 69 83 76 89 81 103 79 68 75 85 112
80 86 98 94 142 155 162 169 104 133 125 96 88 90 151 126 127 111
119 74 71 82 108 110 102 122 101 105 92 97 93 107 95 139 132 179
117 146 123]
--------------------------------------------------
load_values_COUNT('Scheduled_month')
-------------------------------------------------- Scheduled_month : 5 67416 4 25338 6 13750 3 3614 2 281 12 61 1 60 11 1 Name: Scheduled_month, dtype: int64 sum = 110521 , rows = 110521 unique = [ 4 3 2 1 5 11 12 6] --------------------------------------------------
load_values_COUNT('ScheduledD_day')
-------------------------------------------------- ScheduledD_day : Tuesday 26167 Wednesday 24259 Monday 23084 Friday 18915 Thursday 18072 Saturday 24 Name: ScheduledD_day, dtype: int64 sum = 110521 , rows = 110521 unique = ['Friday' 'Wednesday' 'Tuesday' 'Thursday' 'Monday' 'Saturday'] --------------------------------------------------
load_values_COUNT('Scheduled_hour')
-------------------------------------------------- Scheduled_hour : 7 19213 8 15349 9 12822 10 11055 14 9126 13 9034 11 8462 15 8079 16 5542 12 5422 17 2909 6 1577 18 1340 19 488 20 100 21 3 Name: Scheduled_hour, dtype: int64 sum = 110521 , rows = 110521 unique = [18 16 17 8 15 12 14 11 10 9 7 13 19 20 6 21] --------------------------------------------------
load_values_COUNT('Appointment_month')
-------------------------------------------------- Appointment_month : 5 80836 6 26450 4 3235 Name: Appointment_month, dtype: int64 sum = 110521 , rows = 110521 unique = [4 5 6] --------------------------------------------------
load_values_COUNT('Appointment_day')
-------------------------------------------------- Appointment_day : Wednesday 25866 Tuesday 25638 Monday 22713 Friday 19019 Thursday 17246 Saturday 39 Name: Appointment_day, dtype: int64 sum = 110521 , rows = 110521 unique = ['Friday' 'Tuesday' 'Monday' 'Wednesday' 'Thursday' 'Saturday'] --------------------------------------------------
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 110521 entries, 0 to 110520 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PatientId 110521 non-null float64 1 AppointmentID 110521 non-null int64 2 Gender 110521 non-null object 3 Age 110521 non-null int64 4 Neighbourhood 110521 non-null object 5 Scholarship 110521 non-null int64 6 Hipertension 110521 non-null int64 7 Diabetes 110521 non-null int64 8 Alcoholism 110521 non-null int64 9 Handcap 110521 non-null int64 10 SMS_received 110521 non-null int64 11 Attended_appointment 110521 non-null int64 12 waiting_period 110521 non-null int64 13 Scheduled_month 110521 non-null int64 14 ScheduledD_day 110521 non-null object 15 Scheduled_hour 110521 non-null int64 16 Appointment_month 110521 non-null int64 17 Appointment_day 110521 non-null object dtypes: float64(1), int64(13), object(4) memory usage: 15.2+ MB
# Helper that draws sns.countplot with a percentage label on every bar:
def countplot_ratio(x, data, hue=None, ax=None):
    """Draw a count plot of column ``x`` and label each bar with its share of rows.

    Parameters
    ----------
    x : str
        Name of the column in ``data`` to count; also used to build the title.
    data : DataFrame-like
        Data passed on to ``sns.countplot``; ``len(data)`` is the denominator
        of every percentage, also when ``hue`` splits the bars.
    hue : str, optional
        Column used to split each bar into coloured groups.
    ax : matplotlib Axes, optional
        Axes to draw on; forwarded to ``sns.countplot``.
    """
    # Everything is passed by keyword: newer seaborn releases take `data`
    # as the first positional parameter, so a positional `x` would clash
    # with the `data=` keyword.
    ax = sns.countplot(x=x, data=data, hue=hue, ax=ax)
    ax.set_title(x + " Distributions")
    total = float(len(data))
    for p in ax.patches:
        height = p.get_height()
        # Skip patches without observations (NaN or zero height): they have
        # nothing to label, and a NaN height cannot be positioned.
        if not height > 0:
            continue
        ax.text(p.get_x() + p.get_width() / 2., height + 3,
                '{:.2f}%'.format((height / total) * 100),
                fontsize=12, weight='bold', ha="center")
# Overall attendance split. The column is referenced by name rather than by
# position so the plot does not silently change if columns are reordered.
countplot_ratio(x='Attended_appointment', hue=None, data=df)
# Subsets reused by the per-feature comparisons below (1 = attended, 0 = missed).
attend_df = df[df['Attended_appointment'] == 1]
notattend_df = df[df['Attended_appointment'] == 0]
# number of rows whose PatientId already appeared in an earlier row:
df['PatientId'].duplicated().sum()
48223
df['waiting_period'].unique()
array([ 0, 2, 3, 1, 4, 9, 29, 10, 23, 11, 18, 17, 14,
28, 24, 21, 15, 16, 22, 43, 30, 31, 42, 32, 56, 45,
46, 39, 37, 38, 44, 50, 60, 52, 53, 65, 67, 91, 66,
84, 78, 87, 115, 109, 63, 70, 72, 57, 58, 51, 59, 41,
49, 73, 64, 20, 33, 34, 6, 35, 36, 12, 13, 40, 47,
8, 5, 7, 25, 26, 48, 27, 19, 61, 55, 62, 176, 54,
77, 69, 83, 76, 89, 81, 103, 79, 68, 75, 85, 112, 80,
86, 98, 94, 142, 155, 162, 169, 104, 133, 125, 96, 88, 90,
151, 126, 127, 111, 119, 74, 71, 82, 108, 110, 102, 122, 101,
105, 92, 97, 93, 107, 95, 139, 132, 179, 117, 146, 123],
dtype=int64)
# Focus on appointments with a waiting period below 7 days and split them by attendance.
waiting_df = df[df['waiting_period'] < 7]
plt.figure(figsize=(10,8))
# Column referenced by name instead of position, so reordering columns cannot change the plot.
countplot_ratio(x='waiting_period', hue='Attended_appointment', data=waiting_df)
# Waiting-period statistics for patients who missed their appointment:
print(f'average waiting_period of patient who didnot attend was about: ({int(notattend_df.waiting_period.mean())}) days')
print(f'most waiting_period of patient who didnot attend was about: ({notattend_df.waiting_period.mode()[0]}) days')
print(f'mdian waiting_period of patient who didnot attend was about: ({int(notattend_df.waiting_period.median())}) days')
average waiting_period of patient who didnot attend was about: (15) days most waiting_period of patient who didnot attend was about: (0) days mdian waiting_period of patient who didnot attend was about: (11) days
# Waiting-period statistics (mean, mode, median) for patients who attended:
print(f"average waiting_period of patient who attended was about: ({int(attend_df['waiting_period'].mean())}) days")
print(f"most waiting_period of patient who attended was about: ({attend_df['waiting_period'].mode()[0]}) days")
print(f"mdian waiting_period of patient who attended was about: ({int(attend_df['waiting_period'].median())}) days")
average waiting_period of patient who attended was about: (8) days most waiting_period of patient who attended was about: (0) days mdian waiting_period of patient who attended was about: (2) days
# Gender : referenced by name rather than by column position, so the plots
# keep showing Gender even if the column order changes.
countplot_ratio(x='Gender', hue='Attended_appointment', data=df)
countplot_ratio(x='Gender', hue=None, data=df)
# Overlay the age histograms of attended and missed appointments.
for subset, tag in ((attend_df, 'attend'), (notattend_df, 'not attend')):
    subset['Age'].hist(bins=20, label=tag)
plt.title("Histogram of Ages")
plt.xlabel("Age")
plt.ylabel("count")
plt.legend();
# Per-age counts split by attendance, ordered from the most to the least frequent age.
plt.figure(figsize=(16, 4))
plt.xticks(rotation=90)
age_order = df['Age'].value_counts().index
ax = sns.countplot(x=df['Age'], hue=df['Attended_appointment'], order=age_order)
ax.set_title("attend/not attend of Appointments by Age")
plt.show()
# Age statistics for patients who missed their appointment.
# value_counts() is indexed by the age values themselves, so `[0]` would look
# up the count for age 0 by label; `.iloc[0]` takes the top (most frequent)
# entry, which is what the message reports.
print(f'average age of patient who didnot attend was about: ({int(notattend_df.Age.mean())}) years old')
print(f'most age of patient who didnot attend was about: ({notattend_df.Age.mode()[0]}) years old about: ({notattend_df.Age.value_counts().iloc[0]}) patients')
print(f'mdian age of patient who didnot attend was about: ({int(notattend_df.Age.median())}) years old ')
average age of patient who didnot attend was about: (34) years old most age of patient who didnot attend was about: (0) years old about: (638) patients mdian age of patient who didnot attend was about: (33) years old
# Age statistics for patients who attended.
# value_counts() is indexed by the age values themselves, so `[0]` would look
# up the count for age 0 by label; `.iloc[0]` takes the top (most frequent)
# entry, which is what the message reports.
print(f'average age of patient who attended was about: ({int(attend_df.Age.mean())}) years old')
print(f'most age of patient who attended was about: ({attend_df.Age.mode()[0]}) years old about: ({attend_df.Age.value_counts().iloc[0]}) patients')
print(f'mdian age of patient who attended was about: ({int(attend_df.Age.median())}) years old')
average age of patient who attended was about: (37) years old most age of patient who attended was about: (0) years old about: (2900) patients mdian age of patient who attended was about: (38) years old
# Appointments per neighbourhood split by attendance, busiest neighbourhood first.
plt.figure(figsize=(16, 4))
plt.xticks(rotation=90)
neighbourhood_order = df['Neighbourhood'].value_counts().index
ax = sns.countplot(
    x=df['Neighbourhood'],
    hue=df['Attended_appointment'],
    order=neighbourhood_order,
)
ax.set_title("attend/not attend of Appointments by Neighbourhood")
plt.show()
# Most frequent neighbourhood in each group and how often it occurs.
# `.iloc[0]` takes the top entry of value_counts() explicitly by position;
# plain `[0]` on a string-indexed Series relies on a positional fallback
# that pandas has deprecated.
print(f'the most patient who attended was from: ({attend_df.Neighbourhood.mode()[0]}) about: ({attend_df.Neighbourhood.value_counts().iloc[0]}) times' )
print(f'the most patient who didnot attend was from: ({notattend_df.Neighbourhood.mode()[0]})about: ({notattend_df.Neighbourhood.value_counts().iloc[0]}) times ')
the most patient who attended was from: (JARDIM CAMBURI) about: (6252) times the most patient who didnot attend was from: (JARDIM CAMBURI)about: (1465) times
# SMS_received : overlay of the two attendance groups, then count plots.
attend_df['SMS_received'].hist( bins=20, label='attend')
notattend_df['SMS_received'].hist( bins=20, label='not attend')
plt.legend();
# Column referenced by name instead of position, so reordering columns cannot change the plot.
countplot_ratio(x='SMS_received', hue=None, data=df)
countplot_ratio(x='SMS_received', hue='Attended_appointment', data=df)
# Hipertension : overall distribution, split by attendance, then only the
# patients flagged with the condition. The column is referenced by name
# instead of position, so reordering columns cannot change the plots.
countplot_ratio(x='Hipertension', hue=None, data=df)
countplot_ratio(x='Hipertension', hue='Attended_appointment', data=df)
Hipertension_df = df[df['Hipertension'] == 1]
countplot_ratio(x='Hipertension', hue='Attended_appointment', data=Hipertension_df)
# Diabetes : overall distribution, split by attendance, then only the
# patients flagged with the condition. The column is referenced by name
# instead of position, so reordering columns cannot change the plots.
countplot_ratio(x='Diabetes', hue=None, data=df)
countplot_ratio(x='Diabetes', hue='Attended_appointment', data=df)
Diabetes_df = df[df['Diabetes'] == 1]
countplot_ratio(x='Diabetes', hue='Attended_appointment', data=Diabetes_df)
# Alcoholism : overall distribution, split by attendance, then only the
# patients flagged with the condition. The column is referenced by name
# instead of position, so reordering columns cannot change the plots.
countplot_ratio(x='Alcoholism', hue=None, data=df)
countplot_ratio(x='Alcoholism', hue='Attended_appointment', data=df)
Alcoholism_df = df[df['Alcoholism'] == 1]
countplot_ratio(x='Alcoholism', hue='Attended_appointment', data=Alcoholism_df)
# Scholarship : overall distribution, split by attendance, then only the
# patients who have a scholarship. The column is referenced by name
# instead of position, so reordering columns cannot change the plots.
countplot_ratio(x='Scholarship', hue=None, data=df)
countplot_ratio(x='Scholarship', hue='Attended_appointment', data=df)
Scholarship_df = df[df['Scholarship'] == 1]
countplot_ratio(x='Scholarship', hue='Attended_appointment', data=Scholarship_df)
# Handcap : overall distribution, split by attendance, then only the rows
# with Handcap == 1. The column is referenced by name instead of position,
# so reordering columns cannot change the plots.
countplot_ratio(x='Handcap', hue=None, data=df)
plt.figure(figsize=(10,5))
countplot_ratio(x='Handcap', hue='Attended_appointment', data=df)
# NOTE(review): Handcap takes the values 0-4 in this dataset, so `== 1`
# leaves out levels 2-4 -- confirm whether `> 0` was intended.
Handcap_df = df[df['Handcap'] == 1]
countplot_ratio(x='Handcap', hue='Attended_appointment', data=Handcap_df)
# Let's, now explore columns that have date :
date_column
['Scheduled_month', 'ScheduledD_day', 'Scheduled_hour', 'Appointment_month', 'Appointment_day', 'Appointment_hour']
# Scheduling-date distributions. Columns are referenced by name instead of
# position, so reordering columns cannot change the plots.
# for month :
countplot_ratio(x='Scheduled_month', hue=None, data=df)
# for day :
countplot_ratio(x='ScheduledD_day', hue=None, data=df)
# for hour :
plt.figure(figsize=(11,5))
countplot_ratio(x='Scheduled_hour', hue=None, data=df)
# Correlation heatmap of the numeric features.
plt.figure(figsize=(20,15))
# Restrict to numeric columns explicitly: the frame still holds text columns
# (e.g. Gender, Neighbourhood), and recent pandas versions raise an error
# instead of silently skipping them in corr().
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);