import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
%matplotlib inline


# Load the raw appointments table. `data` keeps the file contents as read;
# every cleaning step below works on the independent copy `df`.
data = pd.read_csv('noshows.csv')
df = data.copy()


# Preview the first rows.
df.head()


# (rows, columns) of the working frame.
df.shape

(110527, 14)


df.duplicated().sum()

0


# Missing values per column, largest count first:
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False).to_frame()


# Column names, non-null counts and dtypes:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


# Compare the number of distinct patients with the number of distinct appointments:
n_patients = df['PatientId'].nunique(dropna=False)
n_appointments = df['AppointmentID'].nunique(dropna=False)
print(n_patients, '..', n_appointments)

62299 .. 110527


# Target column as loaded ('No' / 'Yes' strings):
df[['No-show']]


# 'No-show' reads as a double negative; rename it so that 1 can mean "attended".
df.rename(columns={'No-show': 'Attended_appointment'}, inplace=True)


# Encode as integers: 'No' (did not miss the appointment) -> 1, 'Yes' -> 0.
# The explicit astype pins the column to int64 instead of depending on
# Series.replace silently downcasting an object column, and it raises if any
# label other than 'No'/'Yes' is left behind. Re-running the cell is harmless.
df['Attended_appointment'] = (
    df['Attended_appointment'].replace({'No': 1, 'Yes': 0}).astype('int64')
)


# Target column after encoding:
df[['Attended_appointment']]


# Convert both timestamp columns from strings to datetimes.
for day_col in ('ScheduledDay', 'AppointmentDay'):
    df[day_col] = pd.to_datetime(df[day_col])


# Confirm the conversion took effect.
for day_col in ('ScheduledDay', 'AppointmentDay'):
    print(df[day_col].dtype)

datetime64[ns, UTC]
datetime64[ns, UTC]


# Whole calendar days between booking and appointment. normalize() truncates
# each timestamp to midnight, so the subtraction stays in native
# datetime/timedelta dtype instead of building Python date objects row by row
# and relying on `.dt` working on the resulting object column.
scheduled_date = df['ScheduledDay'].dt.normalize()
appointment_date = df['AppointmentDay'].dt.normalize()
df['waiting_period'] = (appointment_date - scheduled_date).dt.days
df.loc[:, ['waiting_period']]


# Distinct waiting periods (negative values mean the appointment precedes the booking):
df['waiting_period'].unique()

array([  0,   2,   3,   1,   4,   9,  29,  10,  23,  11,  18,  17,  14,
        28,  24,  21,  15,  16,  22,  43,  30,  31,  42,  32,  56,  45,
        46,  39,  37,  38,  44,  50,  60,  52,  53,  65,  67,  91,  66,
        84,  78,  87, 115, 109,  63,  70,  72,  57,  58,  51,  59,  41,
        49,  73,  64,  20,  33,  34,   6,  35,  36,  12,  13,  40,  47,
         8,   5,   7,  25,  26,  48,  27,  19,  61,  55,  62, 176,  54,
        77,  69,  83,  76,  89,  81, 103,  79,  68,  75,  85, 112,  -1,
        80,  86,  98,  94, 142, 155, 162, 169, 104, 133, 125,  96,  88,
        90, 151, 126, 127, 111, 119,  74,  71,  82, 108, 110, 102, 122,
       101, 105,  92,  97,  93, 107,  95,  -6, 139, 132, 179, 117, 146,
       123], dtype=int64)


df[df['waiting_period'] < 0 ].shape[0]

5


# Remove the rows whose appointment date precedes the booking date
# (negative waiting period).
negative_wait = df['waiting_period'] < 0
df.drop(index=df.loc[negative_wait].index, inplace=True)


# Should now be empty.
df[df['waiting_period'] < 0]


# Reset the index after dropping rows, to avoid errors in later operations
# that walk the index (e.g. for loops).
df.reset_index(drop=True, inplace=True)


# Split both timestamps into calendar parts (month, weekday name, hour).
scheduled_parts = df['ScheduledDay'].dt
appointment_parts = df['AppointmentDay'].dt
derived_columns = {
    'Scheduled_month': scheduled_parts.month,
    'ScheduledD_day': scheduled_parts.day_name(),
    'Scheduled_hour': scheduled_parts.hour,
    'Appointment_month': appointment_parts.month,
    'Appointment_day': appointment_parts.day_name(),
    'Appointment_hour': appointment_parts.hour,
}
for column_name, column_values in derived_columns.items():
    df[column_name] = column_values


# The raw timestamp columns are dropped once their parts are extracted.
df.drop(columns=['ScheduledDay', 'AppointmentDay'], inplace=True)


# List the distinct values of every derived date column:
date_column = list(derived_columns)
for part_name in date_column:
    print(f' values in ({part_name}) : {df[part_name].unique()} \n')

 values in (Scheduled_month) : [ 4  3  2  1  5 11 12  6] 

 values in (ScheduledD_day) : ['Friday' 'Wednesday' 'Tuesday' 'Thursday' 'Monday' 'Saturday'] 

 values in (Scheduled_hour) : [18 16 17  8 15 12 14 11 10  9  7 13 19 20  6 21] 

 values in (Appointment_month) : [4 5 6] 

 values in (Appointment_day) : ['Friday' 'Tuesday' 'Monday' 'Wednesday' 'Thursday' 'Saturday'] 

 values in (Appointment_hour) : [0]


# Appointment_hour contains only zeros, so it carries no information:
df.drop(columns=['Appointment_hour'], inplace=True)


# Overview: one histogram per numeric column.
df.hist(figsize=(10, 8));


# Age distribution:
df['Age'].hist();


# Box plot of Age to expose outliers. The series is passed as x= explicitly:
# newer seaborn releases bind the first positional argument to `data`, so the
# keyword keeps the same (horizontal) plot on every seaborn version.
sns.boxplot(x=df['Age']);


# Rows with a negative age:
df[df['Age'] < 0]


# Drop them by condition rather than by a hard-coded row label, so the cell
# stays correct if earlier filtering changes or the data file is refreshed.
df.drop(index=df[df['Age'] < 0].index, inplace=True)


# check: should now be empty
df[df['Age'] < 0]


# The drop leaves a gap in the index labels around the removed row:
df.iloc[99831:99833]


# Reset the index after dropping rows, to avoid errors in later operations
# that walk the index (e.g. for loops):
df.reset_index(drop=True, inplace=True)
df.iloc[99831:99833]


# def load_values_COUNT():
#     while True :
#         selected_column = input(f'choose : {list(df.columns)}')
#         while selected_column not in  list(df.columns) and selected_column != 'done' : 
#             selected_column = input('please, enter correct answer!')
#         if selected_column == 'done' : 
#             break
#         else :
#             print('-'*50)
#             print(f'{selected_column} : \n')
#             print((df[selected_column].value_counts()))
#             print(f'sum = {(df[selected_column].value_counts().sum())} , rows = {len(df.index)}')
#             print(f'unique = {df[selected_column].unique()}')
#             print('-'*50,'\n')
def load_values_COUNT(selected_column, data=None):
    """Print a frequency summary of one column.

    Prints the value counts of ``selected_column``, the total of those counts
    next to the number of rows (the two differ when the column has missing
    values, because ``value_counts`` skips them by default), and the distinct
    values of the column.

    Parameters
    ----------
    selected_column : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which is what existing one-argument calls rely on.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    frame = df if data is None else data
    # Computed once and reused for both the listing and the total.
    counts = frame[selected_column].value_counts()
    print('-'*50)
    print(f'{selected_column} : \n')
    print(counts)
    print(f'sum = {counts.sum()} , rows = {len(frame.index)}')
    print(f'unique = {frame[selected_column].unique()}')
    print('-'*50)
load_values_COUNT('Gender')

--------------------------------------------------
Gender : 

F    71837
M    38684
Name: Gender, dtype: int64
sum = 110521 , rows = 110521
unique = ['F' 'M']
--------------------------------------------------


load_values_COUNT('Scholarship')

--------------------------------------------------
Scholarship : 

0    99660
1    10861
Name: Scholarship, dtype: int64
sum = 110521 , rows = 110521
unique = [0 1]
--------------------------------------------------


load_values_COUNT('Hipertension')

--------------------------------------------------
Hipertension : 

0    88720
1    21801
Name: Hipertension, dtype: int64
sum = 110521 , rows = 110521
unique = [1 0]
--------------------------------------------------


load_values_COUNT('Diabetes')

--------------------------------------------------
Diabetes : 

0    102578
1      7943
Name: Diabetes, dtype: int64
sum = 110521 , rows = 110521
unique = [0 1]
--------------------------------------------------


load_values_COUNT('Alcoholism')

--------------------------------------------------
Alcoholism : 

0    107161
1      3360
Name: Alcoholism, dtype: int64
sum = 110521 , rows = 110521
unique = [0 1]
--------------------------------------------------


load_values_COUNT('Handcap')

--------------------------------------------------
Handcap : 

0    108282
1      2040
2       183
3        13
4         3
Name: Handcap, dtype: int64
sum = 110521 , rows = 110521
unique = [0 1 2 3 4]
--------------------------------------------------


load_values_COUNT('SMS_received')

--------------------------------------------------
SMS_received : 

0    75040
1    35481
Name: SMS_received, dtype: int64
sum = 110521 , rows = 110521
unique = [0 1]
--------------------------------------------------


load_values_COUNT('Attended_appointment')

--------------------------------------------------
Attended_appointment : 

1    88208
0    22313
Name: Attended_appointment, dtype: int64
sum = 110521 , rows = 110521
unique = [1 0]
--------------------------------------------------


load_values_COUNT('waiting_period')

--------------------------------------------------
waiting_period : 

0      38563
2       6725
4       5290
1       5213
7       4906
       ...  
82         1
146        1
123        1
101        1
127        1
Name: waiting_period, Length: 129, dtype: int64
sum = 110521 , rows = 110521
unique = [  0   2   3   1   4   9  29  10  23  11  18  17  14  28  24  21  15  16
  22  43  30  31  42  32  56  45  46  39  37  38  44  50  60  52  53  65
  67  91  66  84  78  87 115 109  63  70  72  57  58  51  59  41  49  73
  64  20  33  34   6  35  36  12  13  40  47   8   5   7  25  26  48  27
  19  61  55  62 176  54  77  69  83  76  89  81 103  79  68  75  85 112
  80  86  98  94 142 155 162 169 104 133 125  96  88  90 151 126 127 111
 119  74  71  82 108 110 102 122 101 105  92  97  93 107  95 139 132 179
 117 146 123]
--------------------------------------------------


load_values_COUNT('Scheduled_month')

--------------------------------------------------
Scheduled_month : 

5     67416
4     25338
6     13750
3      3614
2       281
12       61
1        60
11        1
Name: Scheduled_month, dtype: int64
sum = 110521 , rows = 110521
unique = [ 4  3  2  1  5 11 12  6]
--------------------------------------------------


load_values_COUNT('ScheduledD_day')

--------------------------------------------------
ScheduledD_day : 

Tuesday      26167
Wednesday    24259
Monday       23084
Friday       18915
Thursday     18072
Saturday        24
Name: ScheduledD_day, dtype: int64
sum = 110521 , rows = 110521
unique = ['Friday' 'Wednesday' 'Tuesday' 'Thursday' 'Monday' 'Saturday']
--------------------------------------------------


load_values_COUNT('Scheduled_hour')

--------------------------------------------------
Scheduled_hour : 

7     19213
8     15349
9     12822
10    11055
14     9126
13     9034
11     8462
15     8079
16     5542
12     5422
17     2909
6      1577
18     1340
19      488
20      100
21        3
Name: Scheduled_hour, dtype: int64
sum = 110521 , rows = 110521
unique = [18 16 17  8 15 12 14 11 10  9  7 13 19 20  6 21]
--------------------------------------------------


load_values_COUNT('Appointment_month')

--------------------------------------------------
Appointment_month : 

5    80836
6    26450
4     3235
Name: Appointment_month, dtype: int64
sum = 110521 , rows = 110521
unique = [4 5 6]
--------------------------------------------------


load_values_COUNT('Appointment_day')

--------------------------------------------------
Appointment_day : 

Wednesday    25866
Tuesday      25638
Monday       22713
Friday       19019
Thursday     17246
Saturday        39
Name: Appointment_day, dtype: int64
sum = 110521 , rows = 110521
unique = ['Friday' 'Tuesday' 'Monday' 'Wednesday' 'Thursday' 'Saturday']
--------------------------------------------------


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110521 entries, 0 to 110520
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   PatientId             110521 non-null  float64
 1   AppointmentID         110521 non-null  int64  
 2   Gender                110521 non-null  object 
 3   Age                   110521 non-null  int64  
 4   Neighbourhood         110521 non-null  object 
 5   Scholarship           110521 non-null  int64  
 6   Hipertension          110521 non-null  int64  
 7   Diabetes              110521 non-null  int64  
 8   Alcoholism            110521 non-null  int64  
 9   Handcap               110521 non-null  int64  
 10  SMS_received          110521 non-null  int64  
 11  Attended_appointment  110521 non-null  int64  
 12  waiting_period        110521 non-null  int64  
 13  Scheduled_month       110521 non-null  int64  
 14  ScheduledD_day        110521 non-null  object 
 15  Scheduled_hour        110521 non-null  int64  
 16  Appointment_month     110521 non-null  int64  
 17  Appointment_day       110521 non-null  object 
dtypes: float64(1), int64(13), object(4)
memory usage: 15.2+ MB


# function for plotting (sns.countplot) with a percentage label on each bar:
def countplot_ratio(x, data, hue=None, ax=None):
    """Draw a count plot of column ``x`` and label every bar with its share.

    Parameters
    ----------
    x : str
        Name of the column in ``data`` to count; also used in the title.
    data : pandas.DataFrame
        Frame holding the column(s) to plot.
    hue : str, optional
        Column used to split each bar into coloured groups.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; when omitted seaborn picks the current axes.

    Notes
    -----
    Each percentage is the bar height divided by ``len(data)``, i.e. the share
    of all rows in ``data``, also when ``hue`` is given.
    """
    # x is passed by keyword: newer seaborn releases bind the first positional
    # argument to `data`, which would clash with the data= keyword.
    ax = sns.countplot(x=x, data=data, hue=hue, ax=ax)
    ax.set_title(x + " Distributions")

    total = float(len(data))
    for p in ax.patches:
        height = p.get_height()
        # Skip patches without a count (NaN or zero height), e.g. an empty
        # x/hue combination; they would otherwise get a 'nan%'/'0.00%' label.
        if not height > 0:
            continue
        ax.text(p.get_x() + p.get_width() / 2., height + 3,
                '{:.2f}%'.format((height / total) * 100),
                fontsize=12, weight='bold', ha="center")


# Overall share of attended (1) vs missed (0) appointments:
countplot_ratio(x='Attended_appointment', hue=None, data=df)


# Split the appointments by outcome for the per-group comparisons below.
attended_mask = df['Attended_appointment'] == 1
missed_mask = df['Attended_appointment'] == 0
attend_df = df[attended_mask]
notattend_df = df[missed_mask]


# number of repeated values:
df['PatientId'].duplicated().sum()

48223


df['waiting_period'].unique()

array([  0,   2,   3,   1,   4,   9,  29,  10,  23,  11,  18,  17,  14,
        28,  24,  21,  15,  16,  22,  43,  30,  31,  42,  32,  56,  45,
        46,  39,  37,  38,  44,  50,  60,  52,  53,  65,  67,  91,  66,
        84,  78,  87, 115, 109,  63,  70,  72,  57,  58,  51,  59,  41,
        49,  73,  64,  20,  33,  34,   6,  35,  36,  12,  13,  40,  47,
         8,   5,   7,  25,  26,  48,  27,  19,  61,  55,  62, 176,  54,
        77,  69,  83,  76,  89,  81, 103,  79,  68,  75,  85, 112,  80,
        86,  98,  94, 142, 155, 162, 169, 104, 133, 125,  96,  88,  90,
       151, 126, 127, 111, 119,  74,  71,  82, 108, 110, 102, 122, 101,
       105,  92,  97,  93, 107,  95, 139, 132, 179, 117, 146, 123],
      dtype=int64)


# Attendance for appointments booked less than seven days ahead.
short_wait = df['waiting_period'] < 7
waiting_df = df[short_wait]
plt.figure(figsize=(10, 8))
countplot_ratio(x='waiting_period', hue='Attended_appointment', data=waiting_df)


# Mean, mode and median waiting period (in days) of the missed appointments.
# int() truncates the mean and median; mode()[0] takes the first mode returned.
print(f'average waiting_period of patient who didnot attend was about: ({int(notattend_df.waiting_period.mean())}) days')
print(f'most waiting_period of patient who didnot attend was about: ({notattend_df.waiting_period.mode()[0]}) days')
print(f'mdian waiting_period of patient who didnot attend was about: ({int(notattend_df.waiting_period.median())}) days')

average waiting_period of patient who didnot attend was about: (15) days
most waiting_period of patient who didnot attend was about: (0) days
mdian waiting_period of patient who didnot attend was about: (11) days


# Mean, mode and median waiting period (in days) of the attended appointments.
# int() truncates the mean and median; mode()[0] takes the first mode returned.
print(f'average waiting_period of patient who attended was about: ({int(attend_df.waiting_period.mean())}) days')
print(f'most waiting_period of patient who attended was about: ({attend_df.waiting_period.mode()[0]}) days')
print(f'mdian waiting_period of patient who attended was about: ({int(attend_df.waiting_period.median())}) days')

average waiting_period of patient who attended was about: (8) days
most waiting_period of patient who attended was about: (0) days
mdian waiting_period of patient who attended was about: (2) days


# Gender split by attendance outcome:
countplot_ratio(x='Gender', hue='Attended_appointment', data=df)


# Overall gender share:
countplot_ratio(x='Gender', hue=None, data=df)


# Overlay the age histograms of both outcome groups on the same axes.
for group_label, group in (('attend', attend_df), ('not attend', notattend_df)):
    group['Age'].hist(bins=20, label=group_label)
plt.title("Histogram of Ages")
plt.xlabel("Age")
plt.ylabel("count")
plt.legend();


# Attendance counts per age, with ages ordered from most to least frequent.
ages_by_frequency = df['Age'].value_counts().index
plt.figure(figsize=(16, 4))
plt.xticks(rotation=90)
ax = sns.countplot(
    x=df['Age'],
    hue=df['Attended_appointment'],
    order=ages_by_frequency,
)
ax.set_title("attend/not attend of Appointments by Age")
plt.show()


# Age statistics of the missed appointments. int() truncates the mean/median.
# The size of the most common age group is read with .iloc[0]: value_counts()
# sorts by frequency, so the first row belongs to the mode, whereas [0] on this
# integer-labelled result would look up the label 0 (age 0) instead.
print(f'average age of patient who didnot attend was about: ({int(notattend_df.Age.mean())}) years old')
print(f'most age of patient who didnot attend was about: ({notattend_df.Age.mode()[0]}) years old about: ({notattend_df.Age.value_counts().iloc[0]}) patients')
print(f'mdian age of patient who didnot attend was about: ({int(notattend_df.Age.median())}) years old ')

average age of patient who didnot attend was about: (34) years old
most age of patient who didnot attend was about: (0) years old about: (638) patients
mdian age of patient who didnot attend was about: (33) years old


# Age statistics of the attended appointments. int() truncates the mean/median.
# The size of the most common age group is read with .iloc[0]: value_counts()
# sorts by frequency, so the first row belongs to the mode, whereas [0] on this
# integer-labelled result would look up the label 0 (age 0) instead.
print(f'average age of patient who attended was about: ({int(attend_df.Age.mean())}) years old')
print(f'most age of patient who attended was about: ({attend_df.Age.mode()[0]}) years old  about: ({attend_df.Age.value_counts().iloc[0]}) patients')
print(f'mdian age of patient who attended was about: ({int(attend_df.Age.median())}) years old')

average age of patient who attended was about: (37) years old
most age of patient who attended was about: (0) years old  about: (2900) patients
mdian age of patient who attended was about: (38) years old


# Attendance counts per neighbourhood, busiest neighbourhoods first.
neighbourhoods_by_frequency = df['Neighbourhood'].value_counts().index
plt.figure(figsize=(16, 4))
plt.xticks(rotation=90)
ax = sns.countplot(
    x=df['Neighbourhood'],
    hue=df['Attended_appointment'],
    order=neighbourhoods_by_frequency,
)
ax.set_title("attend/not attend of Appointments by Neighbourhood")
plt.show()


# Most frequent neighbourhood per outcome group and its appointment count.
# value_counts() is indexed by neighbourhood name, so the top count is taken
# positionally with .iloc[0]; plain [0] on a string index only works through
# the deprecated positional fallback.
print(f'the most patient who attended was from: ({attend_df.Neighbourhood.mode()[0]}) about: ({attend_df.Neighbourhood.value_counts().iloc[0]}) times' )
print(f'the most patient who didnot attend was from: ({notattend_df.Neighbourhood.mode()[0]})about: ({notattend_df.Neighbourhood.value_counts().iloc[0]}) times ')

the most patient who attended was from: (JARDIM CAMBURI) about: (6252) times
the most patient who didnot attend was from: (JARDIM CAMBURI)about: (1465) times


# Overlay the SMS_received histograms of both outcome groups on the same axes.
for group_label, group in (('attend', attend_df), ('not attend', notattend_df)):
    group['SMS_received'].hist(bins=20, label=group_label)
plt.legend();


# Overall share of appointments with / without an SMS:
countplot_ratio(x='SMS_received', hue=None, data=df)


# The same, split by attendance outcome:
countplot_ratio(x='SMS_received', hue='Attended_appointment', data=df)


# Hipertension :
countplot_ratio(x='Hipertension', hue=None, data=df)


countplot_ratio(x='Hipertension', hue='Attended_appointment', data=df)


# Same plot restricted to the rows flagged with Hipertension.
has_hipertension = df['Hipertension'] == 1
Hipertension_df = df[has_hipertension]
countplot_ratio(x='Hipertension', hue='Attended_appointment', data=Hipertension_df)


# Diabetes :
countplot_ratio(x='Diabetes', hue=None, data=df)


countplot_ratio(x='Diabetes', hue='Attended_appointment', data=df)


# Same plot restricted to the rows flagged with Diabetes.
has_diabetes = df['Diabetes'] == 1
Diabetes_df = df[has_diabetes]
countplot_ratio(x='Diabetes', hue='Attended_appointment', data=Diabetes_df)


# Alcoholism :
countplot_ratio(x='Alcoholism', hue=None, data=df)


countplot_ratio(x='Alcoholism', hue='Attended_appointment', data=df)


# Same plot restricted to the rows flagged with Alcoholism.
has_alcoholism = df['Alcoholism'] == 1
Alcoholism_df = df[has_alcoholism]
countplot_ratio(x='Alcoholism', hue='Attended_appointment', data=Alcoholism_df)


# Scholarship :
countplot_ratio(x='Scholarship', hue=None, data=df)


countplot_ratio(x='Scholarship', hue='Attended_appointment', data=df)


# Same plot restricted to the rows flagged with a Scholarship.
has_scholarship = df['Scholarship'] == 1
Scholarship_df = df[has_scholarship]
countplot_ratio(x='Scholarship', hue='Attended_appointment', data=Scholarship_df)


# Handcap (column 9 of df):
countplot_ratio(x=df.columns[9], hue=None, data=df)


# Handcap split by attendance outcome, on a wider figure:
plt.figure(figsize=(10,5))
countplot_ratio(x=df.columns[9], hue='Attended_appointment', data=df)


# NOTE(review): only Handcap == 1 is selected here; the column also takes the
# values 2-4 in this data, which this subset leaves out. Confirm that is intended.
Handcap_df = df[df['Handcap'] == 1]
countplot_ratio(x=Handcap_df.columns[9], hue='Attended_appointment', data=Handcap_df)


# Let's, now explore columns that have date :
date_column

['Scheduled_month',
 'ScheduledD_day',
 'Scheduled_hour',
 'Appointment_month',
 'Appointment_day',
 'Appointment_hour']


# for month :
countplot_ratio(x='Scheduled_month', hue=None, data=df)


# for day :
countplot_ratio(x='ScheduledD_day', hue=None, data=df)


# for hour :
plt.figure(figsize=(11, 5))
countplot_ratio(x='Scheduled_hour', hue=None, data=df)


# Pairwise correlation heatmap. Only numeric columns are correlated: the text
# columns (Gender, Neighbourhood, weekday names) are excluded explicitly,
# because newer pandas raises on them instead of silently dropping them.
plt.figure(figsize=(20, 15))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

	PatientId	AppointmentID	Gender	ScheduledDay	AppointmentDay	Age	Neighbourhood	Hipertension	Diabetes	No-show
0	2.987250e+13	5642903	F	2016-04-29T18:38:08Z	2016-04-29T00:00:00Z	62	JARDIM DA PENHA	1	0	No
1	5.589978e+14	5642503	M	2016-04-29T16:08:27Z	2016-04-29T00:00:00Z	56	JARDIM DA PENHA	0	0	No
2	4.262962e+12	5642549	F	2016-04-29T16:19:04Z	2016-04-29T00:00:00Z	62	MATA DA PRAIA	0	0	No
3	8.679512e+11	5642828	F	2016-04-29T17:29:31Z	2016-04-29T00:00:00Z	8	PONTAL DE CAMBURI	0	0	No
4	8.841186e+12	5642494	F	2016-04-29T16:07:23Z	2016-04-29T00:00:00Z	56	JARDIM DA PENHA	1	1	No

Medical Appointment No-Shows Dataset: Exploratory Data Analysis (EDA)

Exploratory Data Analysis

Research Question 1 (What percentage of patients did not attend their appointment?)

Research Question 2 (Does the waiting period affect whether a patient attends the appointment?)

Research Question 3 (Who is more likely to attend the appointment: male or female patients?)

Research Question 4 (Which ages miss their appointments most often, and which attend most often?)

Research Question 5 (Which neighbourhoods are most committed to attending their appointments, and which are the least?)

Research Question 6 (Does receiving an SMS affect whether a patient attends the appointment?)

Research Question 7 (How do the recorded diseases affect whether a patient attends the appointment?)

Research Question 8 (Does having a scholarship affect whether a patient attends the appointment?)

Research Question 9 (What is the effect of 'Handcap' on a patient's attendance of the appointment?)

Research Question 10 (What are the most frequent times for scheduling appointments?)

	No-show
0	No
1	No
2	No
3	No
4	No
...	...
110522	No
110523	No
110524	No
110525	No
110526	No

	PatientId	AppointmentID	Gender	Age	Neighbourhood	Scholarship	Hipertension	Diabetes	Alcoholism	Handcap	SMS_received	Attended_appointment	waiting_period	Scheduled_month	ScheduledD_day	Scheduled_hour	Appointment_month	Appointment_day
99831	2.664689e+10	5768132	M	3	ROMÃO	0	0	0	0	0	1	1	3	6	Friday	7	6	Monday
99833	2.963993e+13	5768135	F	4	FORTE SÃO JOÃO	0	0	0	0	0	1	1	3	6	Friday	7	6	Monday