# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from scipy import stats
from imblearn.over_sampling import RandomOverSampler
from sklearn.cluster import KMeans

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# Read clean data exported from DB
df = pd.read_csv('Training/TrainingFile_FromDB/InputFile.csv')
df.shape

(3972, 30)


# Check head
df.head(10)


# Define function to count missing values represented by '?'
def calc_missing(df):
    """Print how many '?' placeholders each column contains.

    A header line is always printed; after that, one line is printed for
    every column of ``df`` that holds at least one '?' value. Columns
    without any '?' are omitted. Nothing is returned.
    """
    print('Column \t No. of missing')
    for name in df.columns:
        # Summing the boolean mask counts the cells equal to '?'.
        n_placeholders = (df[name] == '?').sum()
        if n_placeholders:
            print(f'{name} \t {n_placeholders}')


# Count missing
calc_missing(df)

Column 	 No. of missing
age 	 1
sex 	 157
TSH 	 390
T3 	 812
TT4 	 247
T4U 	 409
FTI 	 407
TBG 	 3972


# Define function
def replace_missing(df):
    """Replace every '?' placeholder in ``df`` with ``np.nan``, in place.

    Only columns that actually contain a '?' are reassigned; all other
    columns are left untouched. The frame is mutated and nothing is
    returned.
    """
    for name in df.columns:
        column = df[name]
        if (column == '?').any():
            df[name] = column.replace('?', np.nan)


# Replace ? with Nan
replace_missing(df)


# Check missing
df.isnull().sum()

age                             1
sex                           157
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
TSH_measured                    0
TSH                           390
T3_measured                     0
T3                            812
TT4_measured                    0
TT4                           247
T4U_measured                    0
T4U                           409
FTI_measured                    0
FTI                           407
TBG_measured                    0
TBG                          3972
referral_source                 0
Class                           0
dtype: int64


# Look at columns with duplicate information
df.iloc[:, -13:-2].head()


# Identify columns
cols_to_drop = [col for col in df.columns if '_measured' in col]
cols_to_drop

['TSH_measured',
 'T3_measured',
 'TT4_measured',
 'T4U_measured',
 'FTI_measured',
 'TBG_measured']


df['TBG'].isna().sum()

3972


# Identify columns in which every value is missing; they carry no information
cols_to_drop.extend(
    [name for name in df.columns if df[name].isna().all()]
)


cols_to_drop

['TSH_measured',
 'T3_measured',
 'TT4_measured',
 'T4U_measured',
 'FTI_measured',
 'TBG_measured',
 'TBG']


# Drop the redundant '*_measured' flags and the all-missing columns
# (`columns=` already names the axis, so no separate `axis` argument is needed)
df.drop(columns=cols_to_drop, inplace=True)
df.shape

(3972, 23)


# Check data
df.describe()


# Check data type and categories
print(df['sex'].dtype)
print(df['sex'].value_counts())

object
F    2608
M    1207
Name: sex, dtype: int64


# Convert to numerical
df['sex'] = df['sex'].map({'F':0, 'M':1})


# Check data type
df['sex'].dtype

dtype('float64')


for col in df.columns:
    print(f'{col} \t {len(df[col].unique())}')

age 	 94
sex 	 3
on_thyroxine 	 2
query_on_thyroxine 	 2
on_antithyroid_medication 	 2
sick 	 2
pregnant 	 2
thyroid_surgery 	 2
I131_treatment 	 2
query_hypothyroid 	 2
query_hyperthyroid 	 2
lithium 	 2
goitre 	 2
tumor 	 2
hypopituitary 	 2
psych 	 2
TSH 	 288
T3 	 70
TT4 	 242
T4U 	 147
FTI 	 235
referral_source 	 5
Class 	 4


# Map boolean flag columns ('f' / 't') to 0 / 1.
# Columns are selected by the values they actually hold rather than by
# "has exactly two unique values": that test would turn any other
# two-valued column into all-NaN (Series.map yields NaN for unmapped values)
# and would skip a flag column that happens to contain a single value.
flag_values = {'f', 't'}
for col in df.columns:
    observed = set(df[col].dropna().unique())
    if observed and observed <= flag_values:
        df[col] = df[col].map({'f': 0, 't': 1})


# Check head
df.head()


# Check categories
df['referral_source'].unique()

array(['other', 'SVHC', 'SVI', 'STMW', 'SVHD'], dtype=object)


# Create dummy variables
# drop_first=True to avoid dummy variable trap
df = pd.get_dummies(data=df, columns=['referral_source'], drop_first=True)


# Check head
df.head(2)


# Check categories
df['Class'].unique()

array(['negative', 'compensated_hypothyroid', 'primary_hypothyroid',
       'secondary_hypothyroid'], dtype=object)


# Encode
encoder = LabelEncoder()
df['Class'] = encoder.fit_transform(df['Class'])


# Check categories
df['Class'].unique()

array([1, 0, 2, 3])


# Check missing
df.isnull().sum()

age                            1
sex                          157
on_thyroxine                   0
query_on_thyroxine             0
on_antithyroid_medication      0
sick                           0
pregnant                       0
thyroid_surgery                0
I131_treatment                 0
query_hypothyroid              0
query_hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH                          390
T3                           812
TT4                          247
T4U                          409
FTI                          407
Class                          0
referral_source_SVHC           0
referral_source_SVHD           0
referral_source_SVI            0
referral_source_other          0
dtype: int64


# Impute data
# Fill the remaining NaNs with the unweighted mean of the 3 nearest rows.
# NOTE(review): `df` still contains the encoded `Class` target at this point,
# so the label takes part in the neighbour search — confirm this is intended.
# NOTE(review): features are not scaled before this distance-based step, so
# large-magnitude columns presumably dominate the distances — verify.
imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
# fit_transform returns a plain array; column names are restored in the next cell.
new_array = imputer.fit_transform(df)


# Convert array to df
# Rounding is only meaningful for the discrete columns (age, the 0/1 flags,
# the encoded class and the dummy variables), where a KNN-imputed value can
# be fractional. Rounding the whole array also rounded the continuous lab
# measurements to whole numbers, discarding their decimal part.
# NOTE(review): outputs recorded further down were produced with the rounded
# lab values and will shift slightly once this cell is re-run.
new_df = pd.DataFrame(new_array, columns=df.columns)
continuous_cols = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']
discrete_cols = [name for name in new_df.columns if name not in continuous_cols]
new_df[discrete_cols] = new_df[discrete_cols].round()
new_df.head()


# Check missing
new_df.isnull().sum()

age                          0
sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
I131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH                          0
T3                           0
TT4                          0
T4U                          0
FTI                          0
Class                        0
referral_source_SVHC         0
referral_source_SVHD         0
referral_source_SVI          0
referral_source_other        0
dtype: int64


# Check data
new_df.describe()


# Create histograms (with KDE overlay) of the numeric features after imputation
cols = ['age','TSH','T3','TT4','T4U','FTI']

plt.figure(figsize=(15,10),facecolor='white')

# One panel per feature on a 2x3 grid; subplot indices start at 1
for plotnumber, col in enumerate(cols, start=1):
    ax = plt.subplot(2, 3, plotnumber)
    sns.histplot(new_df[col], kde=True, ax=ax)
    plt.xlabel(col, fontsize=10)


# Create boxplots of the numeric features to spot outliers
cols = ['age','TSH','T3','TT4','T4U','FTI']

plt.figure(figsize=(15,10),facecolor='white')

# One panel per feature on a 2x3 grid; subplot indices start at 1
for plotnumber, col in enumerate(cols, start=1):
    ax = plt.subplot(2, 3, plotnumber)
    sns.boxplot(x=new_df[col], ax=ax)
    plt.xlabel(col, fontsize=10)


# Records with age > 100
len(new_df[new_df['age']>100])

2


# Records with TSH > 150
len(new_df[new_df['TSH']>150]['TSH'])

19


len(new_df[new_df['age']>100])

2


new_df[(new_df['age']>100) | (new_df['TSH']>150)]


# Subset data
df2 = new_df[(new_df['age']<=100) & (new_df['TSH']<=150)]
df2.shape

(3951, 26)


# Create histograms of the numeric features after removing the outlier rows
cols = ['age','TSH','T3','TT4','T4U','FTI']

plt.figure(figsize=(15,10),facecolor='white')

# One panel per feature on a 2x3 grid; subplot indices start at 1
for plotnumber, col in enumerate(cols, start=1):
    ax = plt.subplot(2, 3, plotnumber)
    sns.histplot(df2[col], ax=ax)
    plt.xlabel(col, fontsize=10)


# Create histograms of the log-transformed numeric features
cols = ['age','TSH','T3','TT4','T4U','FTI']

plt.figure(figsize=(15,10),facecolor='white')

for plotnumber, col in enumerate(cols, start=1):
    ax = plt.subplot(2, 3, plotnumber)
    # The small offset keeps log() finite for zero values
    log_values = np.log(df2[col] + 1e-8)
    sns.histplot(log_values, ax=ax)
    plt.xlabel(col, fontsize=10)


# Create histograms of the Box-Cox-transformed numeric features and
# collect the fitted lambda of each feature in `param`
cols = ['age','TSH','T3','TT4','T4U','FTI']

plt.figure(figsize=(15,10),facecolor='white')
param = []

for plotnumber, col in enumerate(cols, start=1):
    ax = plt.subplot(2, 3, plotnumber)
    # Box-Cox needs strictly positive input, hence the small offset
    transformed, fitted_lambda = stats.boxcox(df2[col] + 1e-8)
    sns.histplot(transformed, ax=ax)
    param.append(fitted_lambda)
    plt.xlabel(col, fontsize=10)


# Transformation parameters for each feature
param

[1.0550222277705679,
 0.1537593101418859,
 0.62709777401318,
 0.6048130750519881,
 0.7376933654702804,
 0.656094585046508]


# Check distribution of categorical `Class` feature
sns.countplot(x='Class', data=df2);


# Separate features and target, then balance the classes by randomly
# duplicating minority-class rows.
x = df2.drop(['Class'], axis=1)
y = df2['Class']

# Fixed seed so the resampled data set (and everything derived from it)
# is reproducible between runs.
sampler = RandomOverSampler(random_state=42)
x_sampled, y_sampled = sampler.fit_resample(x,y)


# Check shape
x_sampled.shape

(14664, 25)


# Check data for each category
# Pass the data as keyword `x`: the positional form raises a FutureWarning
# and stops being accepted from seaborn 0.12 onwards.
sns.countplot(x=y_sampled);

/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning


# Create df
x_sampled = pd.DataFrame(x_sampled, columns=x.columns)
x_sampled.head()


# Elbow search: record the within-cluster sum of squares (inertia)
# for 1 to 9 clusters
wcss = []
for i in range(1, 10):
    fitted = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(x_sampled)
    wcss.append(fitted.inertia_)


plt.plot(range(1,10), wcss)

[<matplotlib.lines.Line2D at 0x7fe968e19898>]


# Fit the final clustering model.
# The cluster count used to be read from `i`, the loop variable left over
# from the elbow search (its last value, 9), so it silently depended on
# which cell ran last. It is now an explicit constant with the same value.
# TODO(review): confirm 9 is the elbow actually intended from the WCSS plot.
n_clusters = 9
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
pred = kmeans.fit(x_sampled)


tdf = pd.read_csv('test_cluster')
tdf.shape

(6127, 28)


tdf.drop(['Cluster','Unnamed: 0'], inplace=True, axis=1)


tdf.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery', 'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral_source_SVHC', 'referral_source_SVHD', 'referral_source_SVI', 'referral_source_other', 'Label'], dtype='object')


# `train_test_split` is otherwise only imported in a later cell; import it
# here so a fresh top-to-bottom run does not fail with a NameError.
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing.
# NOTE(review): if this file was derived from the randomly oversampled data,
# duplicated rows can end up in both splits and inflate the test scores —
# confirm how 'test_cluster' was produced.
X = tdf.drop('Label', axis=1)
Y = tdf['Label']
x_train, x_test, y_train, y_test = train_test_split(X, Y,
                                                    test_size=1/3, random_state=101)


y_train.shape

(4084,)


from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score


%%time
# Exhaustive 5-fold grid search over the main XGBoost hyper-parameters.
# eval_metric is set explicitly to the library default ('mlogloss') so that
# XGBoost does not print its "default evaluation metric ... was changed"
# warning once per fit; the fitted models are unaffected.
xgb = XGBClassifier(objective='multi:softmax', eval_metric='mlogloss')
param_grid = {'learning_rate': [0.5, 0.1, 0.01, 0.001],
              'max_depth': [3, 5, 10, 20],
              'n_estimators': [10, 50, 100, 200]}

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                           cv=5, verbose=3)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimators=10;, score=0.991 total time=   0.1s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 2/5] END learning_rate=0.5, max_depth=3, n_estimators=10;, score=0.990 total time=   0.1s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 3/5] END learning_rate=0.5, max_depth=3, n_estimators=10;, score=0.994 total time=   0.0s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 4/5] END learning_rate=0.5, max_depth=3, n_estimators=10;, score=0.993 total time=   0.0s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 5/5] END learning_rate=0.5, max_depth=3, n_estimators=10;, score=0.996 total time=   0.0s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.994 total time=   0.1s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 2/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.993 total time=   0.1s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 3/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.995 total time=   0.1s
[18:19:41] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

[CV 4/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.994 total time=   0.1s
[18:19:42] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 5/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.998 total time=   0.1s
[18:19:42] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.995 total time=   0.2s
[18:19:42] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 2/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.991 total time=   0.2s
[18:19:42] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

[CV 3/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.995 total time=   0.2s
[18:19:42] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[CV 4/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.994 total time=   0.2s
[18:19:42] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
/Users/mohi9282/opt/anaconda3/envs/pytorchenv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)


grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}


# Train the final model with the hyper-parameters selected by the grid
# search instead of hand-copied values, so the two cannot drift apart when
# the search is re-run. eval_metric is set explicitly to the library default
# to avoid the "default evaluation metric ... was changed" warning.
# NOTE: run this cell before the SVC search rebinds `grid_search`.
xgb_model = XGBClassifier(eval_metric='mlogloss', **grid_search.best_params_)
xgb_model.fit(x_train, y_train)

[18:22:38] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


pred_xgb = xgb_model.predict_proba(x_test)
roc_auc_score(y_test, pred_xgb, multi_class='ovr')

0.9999967572787911


from sklearn.svm import SVC


# Tune an RBF-kernel SVM: exhaustive 5-fold cross-validated search over the
# regularisation strength C and the kernel width gamma (3 x 3 = 9 candidates).
# No `scoring` is given, so GridSearchCV ranks candidates with the estimator's
# default score.
svc = SVC(kernel='rbf', probability=True)
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 0.01, 0.001],
}
grid_search = GridSearchCV(svc, param_grid, cv=5, verbose=3)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ..................C=0.1, gamma=0.1;, score=0.869 total time=   4.2s
[CV 2/5] END ..................C=0.1, gamma=0.1;, score=0.849 total time=   4.1s
[CV 3/5] END ..................C=0.1, gamma=0.1;, score=0.846 total time=   4.0s
[CV 4/5] END ..................C=0.1, gamma=0.1;, score=0.865 total time=   4.2s
[CV 5/5] END ..................C=0.1, gamma=0.1;, score=0.884 total time=   4.2s
[CV 1/5] END .................C=0.1, gamma=0.01;, score=0.971 total time=   2.8s
[CV 2/5] END .................C=0.1, gamma=0.01;, score=0.977 total time=   2.9s
[CV 3/5] END .................C=0.1, gamma=0.01;, score=0.977 total time=   2.7s
[CV 4/5] END .................C=0.1, gamma=0.01;, score=0.976 total time=   2.7s
[CV 5/5] END .................C=0.1, gamma=0.01;, score=0.982 total time=   2.9s
[CV 1/5] END ................C=0.1, gamma=0.001;, score=0.955 total time=   2.0s
[CV 2/5] END ................C=0.1, gamma=0.001;, score=0.957 total time=   1.9s
[CV 3/5] END ................C=0.1, gamma=0.001;, score=0.963 total time=   1.8s
[CV 4/5] END ................C=0.1, gamma=0.001;, score=0.944 total time=   1.8s
[CV 5/5] END ................C=0.1, gamma=0.001;, score=0.962 total time=   1.8s
[CV 1/5] END ....................C=1, gamma=0.1;, score=1.000 total time=   3.0s
[CV 2/5] END ....................C=1, gamma=0.1;, score=1.000 total time=   3.0s
[CV 3/5] END ....................C=1, gamma=0.1;, score=1.000 total time=   3.0s
[CV 4/5] END ....................C=1, gamma=0.1;, score=1.000 total time=   3.0s
[CV 5/5] END ....................C=1, gamma=0.1;, score=1.000 total time=   3.0s
[CV 1/5] END ...................C=1, gamma=0.01;, score=0.993 total time=   1.5s
[CV 2/5] END ...................C=1, gamma=0.01;, score=0.996 total time=   1.5s
[CV 3/5] END ...................C=1, gamma=0.01;, score=0.982 total time=   1.4s
[CV 4/5] END ...................C=1, gamma=0.01;, score=0.989 total time=   1.4s
[CV 5/5] END ...................C=1, gamma=0.01;, score=0.979 total time=   1.5s
[CV 1/5] END ..................C=1, gamma=0.001;, score=0.972 total time=   0.9s
[CV 2/5] END ..................C=1, gamma=0.001;, score=0.976 total time=   0.9s
[CV 3/5] END ..................C=1, gamma=0.001;, score=0.972 total time=   0.9s
[CV 4/5] END ..................C=1, gamma=0.001;, score=0.966 total time=   0.8s
[CV 5/5] END ..................C=1, gamma=0.001;, score=0.971 total time=   0.9s
[CV 1/5] END ...................C=10, gamma=0.1;, score=1.000 total time=   2.8s
[CV 2/5] END ...................C=10, gamma=0.1;, score=1.000 total time=   3.2s
[CV 3/5] END ...................C=10, gamma=0.1;, score=1.000 total time=   2.9s
[CV 4/5] END ...................C=10, gamma=0.1;, score=1.000 total time=   2.8s
[CV 5/5] END ...................C=10, gamma=0.1;, score=1.000 total time=   2.8s
[CV 1/5] END ..................C=10, gamma=0.01;, score=0.991 total time=   1.2s
[CV 2/5] END ..................C=10, gamma=0.01;, score=0.998 total time=   1.2s
[CV 3/5] END ..................C=10, gamma=0.01;, score=0.990 total time=   1.2s
[CV 4/5] END ..................C=10, gamma=0.01;, score=0.991 total time=   1.2s
[CV 5/5] END ..................C=10, gamma=0.01;, score=0.993 total time=   1.2s
[CV 1/5] END .................C=10, gamma=0.001;, score=0.979 total time=   0.5s
[CV 2/5] END .................C=10, gamma=0.001;, score=0.988 total time=   0.5s
[CV 3/5] END .................C=10, gamma=0.001;, score=0.978 total time=   0.5s
[CV 4/5] END .................C=10, gamma=0.001;, score=0.973 total time=   0.5s
[CV 5/5] END .................C=10, gamma=0.001;, score=0.973 total time=   0.5s

GridSearchCV(cv=5, estimator=SVC(probability=True),
             param_grid={'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001]},
             verbose=3)


# Show the (C, gamma) pair that won the SVM grid search fitted just above.
grid_search.best_params_

{'C': 1, 'gamma': 0.1}


# Refit the RBF-kernel SVM on the training split with the grid-search winners
# (C=1, gamma=0.1).
# probability=True is needed for predict_proba; scikit-learn calibrates those
# probabilities with an internal, shuffled cross-validation, so random_state is
# fixed to make the probabilities (and the ROC AUC computed from them)
# reproducible from run to run.
svm_model = SVC(kernel='rbf', probability=True,
                C=1, gamma=0.1, random_state=0)
svm_model.fit(x_train, y_train)

SVC(C=1, gamma=0.1, probability=True)


# Score the tuned SVM on the held-out split: per-class probabilities are fed
# to a one-vs-rest multi-class ROC AUC.
# NOTE(review): a perfect score is unusual -- if the random oversampling ran
# before the train/test split, duplicated rows may sit in both splits and
# inflate the metric; verify against the split cell.
pred_svm = svm_model.predict_proba(x_test)
roc_auc_score(y_true=y_test, y_score=pred_svm, multi_class='ovr')

1.0

	TSH	T3_measured	T3	TT4_measured	TT4	T4U_measured	T4U	FTI_measured	FTI	TBG_measured	TBG
0	1.0	t	1.6	t	93.0	t	0.94	t	99.0	f	NaN
1	1.1	t	2.9	t	125.0	t	1.02	t	122.0	f	NaN
2	2.2	t	1.7	t	83.0	t	1.11	t	76.0	f	NaN
3	1.1	t	1.8	t	94.0	t	0.94	t	100.0	f	NaN
4	NaN	f	NaN	f	NaN	f	NaN	f	NaN	f	NaN

	age	sex	on_thyroxine	query_on_thyroxine	on_antithyroid_medication	sick	pregnant	thyroid_surgery	I131_treatment	query_hypothyroid	query_hyperthyroid	lithium	goitre	tumor	hypopituitary	psych	TSH	T3	TT4	T4U	FTI	Class	referral_source_SVHC	referral_source_SVHD	referral_source_SVI	referral_source_other
count	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000	3972.000000
mean	51.681017	0.309416	0.122860	0.014099	0.011581	0.039275	0.013847	0.013847	0.016616	0.060171	0.061934	0.005035	0.008812	0.025176	0.000252	0.048338	4.816717	2.035247	108.469789	1.020393	110.100453	0.974824	0.103474	0.010070	0.273414	0.583333
std	21.079318	0.462311	0.328318	0.117913	0.107004	0.194273	0.116870	0.116870	0.127845	0.237834	0.241065	0.070790	0.093468	0.156680	0.015867	0.214507	23.011349	0.780502	34.717478	0.162878	31.999075	0.278266	0.304616	0.099858	0.445768	0.493069
min	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	36.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	2.000000	88.000000	1.000000	93.000000	1.000000	0.000000	0.000000	0.000000	0.000000
50%	54.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	2.000000	104.000000	1.000000	107.000000	1.000000	0.000000	0.000000	0.000000	1.000000
75%	67.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	3.000000	2.000000	124.000000	1.000000	124.000000	1.000000	0.000000	0.000000	1.000000	1.000000
max	455.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	530.000000	11.000000	430.000000	2.000000	395.000000	3.000000	1.000000	1.000000	1.000000	1.000000

EDA - Thyroid Classification¶

Table of Contents

Introduction¶

Read the data¶

Data Exploration¶

Missing Values¶

Identify missing¶

Replace `?` with `NaN`¶

Remove columns with duplicate information¶

Feature Engineering¶

Convert `sex` column to numerical¶

Convert columns with binary categories to numerical¶

One-Hot encode `referral source`¶

Label encode `Class`¶

Impute Missing¶

Handle Outliers¶

Remove records with `age > 100` and `TSH > 150`¶

Data Transformation¶

Apply log transformation¶

Apply Box-Cox transformation¶

Distribution of `Class`¶

Oversample using `RandomOverSampler()`¶

Clustering¶

Modeling¶

XGB¶

SVM¶

Column	Description	Column	Description
age	Age of the person	TSH_measured	true or false
sex	Male or Female	TSH	thyroid stimulating hormone floating value
on_thyroxine	true or false	T3_measured	true or false
on_antithyroid_medication	true or false	T3	triiodothyronine value
sick	true or false	TT4_measured	true or false
pregnant	true or false	TT4	Thyroxine value
thyroid_surgery	true or false	T4U_measured	true or false
I131_treatment	true or false	T4U	numerical value
query_hypothyroid	true or false	FTI_measured	true or false
query_hyperthyroid	true or false	FTI	Free Thyroxine Index
lithium	true or false	TBG_measured	true or false
goitre	true or false	TBG	Thyroid Binding Globulin value
tumor	true or false	referral_source	different sources of referrals
hypopituitary	true or false	Class	different types of thyroid
psych	true or false

	age	sex	on_thyroxine	query_on_thyroxine	on_antithyroid_medication	sick	pregnant	thyroid_surgery	I131_treatment	query_hypothyroid	query_hyperthyroid	lithium	goitre	tumor	hypopituitary	psych	TSH	T3	TT4	T4U	FTI	referral_source	Class
count	3971	3815	3972	3972	3972	3972	3972	3972	3972	3972	3972	3972	3972	3972	3972	3972	3582	3160	3725	3563	3565	3972	3972
unique	93	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	287	69	241	146	234	5	4
top	59	F	f	f	f	f	f	f	f	f	f	f	f	f	f	f	0.2	2.0	101.0	0.99	100.0	other	negative
freq	102	2608	3484	3916	3926	3816	3917	3917	3906	3733	3726	3952	3937	3872	3971	3780	119	251	72	101	77	2317	3668

	age	sex	on_antithyroid_medication	sick	psych	TSH	T3	TT4	T4U	FTI	Class	referral_source_SVHC	referral_source_other
0	49.0	0.0	0.0	1.0	0.0	1.0	2.0	93.0	1.0	99.0	1.0	0.0	1.0
1	46.0	1.0	0.0	0.0	1.0	1.0	3.0	125.0	1.0	122.0	1.0	1.0	0.0
2	80.0	0.0	1.0	0.0	0.0	2.0	2.0	83.0	1.0	76.0	1.0	0.0	1.0
3	80.0	0.0	0.0	0.0	1.0	1.0	2.0	94.0	1.0	100.0	1.0	1.0	0.0
4	26.0	0.0	0.0	0.0	0.0	2.0	3.0	108.0	1.0	90.0	1.0	0.0	1.0

	age	sex	on_thyroxine	query_hypothyroid	query_hyperthyroid	TSH	T3	TT4	T4U	FTI	Class	referral_source_SVI	referral_source_other
288	39.0	0.0	0.0	0.0	0.0	160.0	0.0	11.0	1.0	9.0	2.0	0.0	1.0
316	50.0	1.0	0.0	0.0	0.0	151.0	1.0	32.0	1.0	28.0	2.0	0.0	1.0
639	33.0	1.0	0.0	0.0	0.0	160.0	0.0	10.0	1.0	13.0	2.0	1.0	0.0
937	53.0	0.0	0.0	0.0	0.0	165.0	0.0	17.0	1.0	14.0	2.0	0.0	1.0
1018	60.0	0.0	0.0	0.0	0.0	151.0	1.0	42.0	1.0	39.0	2.0	0.0	1.0
1307	2.0	0.0	0.0	0.0	0.0	472.0	2.0	34.0	1.0	29.0	2.0	0.0	0.0
1564	455.0	0.0	0.0	0.0	0.0	1.0	2.0	118.0	1.0	104.0	1.0	1.0	0.0
1575	53.0	0.0	0.0	0.0	0.0	183.0	1.0	14.0	1.0	11.0	2.0	0.0	1.0
1902	60.0	0.0	0.0	1.0	0.0	183.0	1.0	45.0	1.0	46.0	2.0	0.0	1.0
1972	25.0	0.0	0.0	0.0	0.0	468.0	1.0	21.0	1.0	19.0	2.0	0.0	1.0
2027	18.0	0.0	0.0	0.0	0.0	440.0	0.0	24.0	1.0	18.0	2.0	0.0	1.0
2049	59.0	0.0	0.0	0.0	0.0	530.0	2.0	10.0	1.0	8.0	2.0	0.0	1.0
2239	44.0	1.0	0.0	0.0	0.0	199.0	1.0	10.0	1.0	10.0	2.0	1.0	0.0
2259	31.0	0.0	0.0	0.0	0.0	188.0	1.0	63.0	1.0	53.0	2.0	0.0	1.0
2777	25.0	1.0	0.0	0.0	0.0	236.0	0.0	16.0	1.0	17.0	2.0	0.0	1.0
3039	60.0	0.0	1.0	0.0	0.0	400.0	0.0	11.0	1.0	9.0	2.0	0.0	1.0
3054	35.0	0.0	0.0	0.0	1.0	230.0	2.0	36.0	1.0	27.0	2.0	0.0	1.0
3337	18.0	0.0	1.0	0.0	0.0	478.0	3.0	45.0	1.0	34.0	2.0	0.0	1.0
3736	455.0	0.0	0.0	0.0	0.0	1.0	2.0	118.0	1.0	104.0	1.0	1.0	0.0
3747	53.0	0.0	0.0	0.0	0.0	183.0	1.0	14.0	1.0	11.0	2.0	0.0	1.0
3787	48.0	0.0	1.0	1.0	0.0	178.0	1.0	63.0	1.0	59.0	2.0	0.0	1.0

EDA - Thyroid Classification¶

Table of Contents

Introduction¶

Read the data¶

Data Exploration¶

Missing Values¶

Identify missing¶

Replace ? with NaN¶

Remove columns with duplicate information¶

Feature Engineering¶

Convert sex column to numerical¶

Convert columns with binary categories to numerical¶

One-Hot encode referral source¶

Label encode Class¶

Impute Missing¶

Handle Outliers¶

Remove records with age > 100 and TSH > 150¶

Data Transformation¶

Apply log transformation¶

Apply Box-Cox transformation¶

Distribution of Class¶

Oversample using RandomOverSampler()¶

Clustering¶

Modeling¶

XGB¶

SVM¶

Replace `?` with `NaN`¶

Convert `sex` column to numerical¶

One-Hot encode `referral source`¶

Label encode `Class`¶

Remove records with `age > 100` and `TSH > 150`¶

Distribution of `Class`¶

Oversample using `RandomOverSampler()`¶