Data science example#

Reading the dataset#

import pandas as pd

# Path to the data on GitHub
path_datos = "https://raw.githubusercontent.com/BioAITeamLearning/IntroPython_2024_01_UAI/main/Data/"
# Read the datasets (path_datos already ends with "/", so no extra slash is needed)
df = pd.read_csv(path_datos + "wisc_bc_data.csv")
df2 = pd.read_csv(path_datos + "BDParkinson_Prediction.csv")
# Show the dataframe
df.head(20)
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.26540 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.18600 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.24300 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.25750 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.16250 0.2364 0.07678
5 843786 M 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 ... 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.17410 0.3985 0.12440
6 844359 M 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 ... 22.88 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.19320 0.3063 0.08368
7 84458202 M 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 ... 17.06 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.15560 0.3196 0.11510
8 844981 M 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 ... 15.49 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.20600 0.4378 0.10720
9 84501001 M 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 ... 15.09 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.22100 0.4366 0.20750
10 845636 M 16.02 23.24 102.70 797.8 0.08206 0.06669 0.03299 0.03323 ... 19.19 33.88 123.80 1150.0 0.1181 0.1551 0.1459 0.09975 0.2948 0.08452
11 84610002 M 15.78 17.89 103.60 781.0 0.09710 0.12920 0.09954 0.06606 ... 20.42 27.28 136.50 1299.0 0.1396 0.5609 0.3965 0.18100 0.3792 0.10480
12 846226 M 19.17 24.80 132.40 1123.0 0.09740 0.24580 0.20650 0.11180 ... 20.96 29.94 151.70 1332.0 0.1037 0.3903 0.3639 0.17670 0.3176 0.10230
13 846381 M 15.85 23.95 103.70 782.7 0.08401 0.10020 0.09938 0.05364 ... 16.84 27.66 112.00 876.5 0.1131 0.1924 0.2322 0.11190 0.2809 0.06287
14 84667401 M 13.73 22.61 93.60 578.3 0.11310 0.22930 0.21280 0.08025 ... 15.03 32.01 108.80 697.7 0.1651 0.7725 0.6943 0.22080 0.3596 0.14310
15 84799002 M 14.54 27.54 96.73 658.8 0.11390 0.15950 0.16390 0.07364 ... 17.46 37.13 124.10 943.2 0.1678 0.6577 0.7026 0.17120 0.4218 0.13410
16 848406 M 14.68 20.13 94.74 684.5 0.09867 0.07200 0.07395 0.05259 ... 19.07 30.88 123.40 1138.0 0.1464 0.1871 0.2914 0.16090 0.3029 0.08216
17 84862001 M 16.13 20.68 108.10 798.8 0.11700 0.20220 0.17220 0.10280 ... 20.96 31.48 136.80 1315.0 0.1789 0.4233 0.4784 0.20730 0.3706 0.11420
18 849014 M 19.81 22.15 130.00 1260.0 0.09831 0.10270 0.14790 0.09498 ... 27.32 30.88 186.80 2398.0 0.1512 0.3150 0.5372 0.23880 0.2768 0.07615
19 8510426 B 13.54 14.36 87.46 566.3 0.09779 0.08129 0.06664 0.04781 ... 15.11 19.26 99.70 711.2 0.1440 0.1773 0.2390 0.12880 0.2977 0.07259

20 rows × 32 columns

# Show all columns of the dataframe
pd.options.display.max_columns = None
# Show the dataframe again, now with all columns visible
df.head(5)
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
# Show the column names
list(df.columns.values)
['id',
 'diagnosis',
 'radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'radius_se',
 'texture_se',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave points_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

Removing unnecessary columns from the dataset#

# Drop the id column; an identifier cannot be used as a feature
df = df.drop(['id'], axis=1)

# Show the df without the identifier column
df.head()
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

Exploratory data analysis (EDA)#

# Number of classes
print(f'Number of classes: {len(df["diagnosis"].value_counts())}')
Number of classes: 2
# Frequency per class
print(df["diagnosis"].value_counts())

ax = df['diagnosis'].value_counts().plot(kind='bar')
ax.set_title('Classes')
ax.set_ylabel('Frequency')
diagnosis
B    357
M    212
Name: count, dtype: int64
Text(0, 0.5, 'Frequency')
[Figure: bar chart of class frequencies]
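The bar chart shows class B outnumbering class M. A minimal follow-up sketch (using only the df already loaded) quantifies the imbalance with value_counts(normalize=True):

# Proportion of each class (B ≈ 0.63, M ≈ 0.37 for this dataset)
print(df["diagnosis"].value_counts(normalize=True))
# Imbalance ratio: majority count divided by minority count (357/212 ≈ 1.68)
counts = df["diagnosis"].value_counts()
print(f"Imbalance ratio: {counts.max() / counts.min():.2f}")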
# Count of values per feature, grouped by class; verifies that no feature has null values
df.groupby("diagnosis").count()
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
diagnosis
B 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357 357
M 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212 212
# Rename the features whose names contain spaces
df.rename(columns={'concave points_mean':'concave_points_mean',
                   'concave points_se':'concave_points_se',
                   'concave points_worst':'concave_points_worst'}, inplace=True)
# Feature names, counts, null-value check, and the dtype of each feature
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave_points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  569 non-null    float64
 15  smoothness_se            569 non-null    float64
 16  compactness_se           569 non-null    float64
 17  concavity_se             569 non-null    float64
 18  concave_points_se        569 non-null    float64
 19  symmetry_se              569 non-null    float64
 20  fractal_dimension_se     569 non-null    float64
 21  radius_worst             569 non-null    float64
 22  texture_worst            569 non-null    float64
 23  perimeter_worst          569 non-null    float64
 24  area_worst               569 non-null    float64
 25  smoothness_worst         569 non-null    float64
 26  compactness_worst        569 non-null    float64
 27  concavity_worst          569 non-null    float64
 28  concave_points_worst     569 non-null    float64
 29  symmetry_worst           569 non-null    float64
 30  fractal_dimension_worst  569 non-null    float64
dtypes: float64(30), object(1)
memory usage: 137.9+ KB
# Basic statistics for the features:
# count, mean, standard deviation, minimum, quartiles, and maximum
df.describe()
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave_points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave_points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 0.405172 1.216853 2.866059 40.337079 0.007041 0.025478 0.031894 0.011796 0.020542 0.003795 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 0.277313 0.551648 2.021855 45.491006 0.003003 0.017908 0.030186 0.006170 0.008266 0.002646 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 0.111500 0.360200 0.757000 6.802000 0.001713 0.002252 0.000000 0.000000 0.007882 0.000895 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 0.232400 0.833900 1.606000 17.850000 0.005169 0.013080 0.015090 0.007638 0.015160 0.002248 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 0.324200 1.108000 2.287000 24.530000 0.006380 0.020450 0.025890 0.010930 0.018730 0.003187 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 0.478900 1.474000 3.357000 45.190000 0.008146 0.032450 0.042050 0.014710 0.023480 0.004558 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 2.873000 4.885000 21.980000 542.200000 0.031130 0.135400 0.396000 0.052790 0.078950 0.029840 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500
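describe() aggregates over both classes. A complementary sketch (not part of the original flow) groups by diagnosis to hint at which features separate B from M; the cutoff of 10 features shown is arbitrary:

# Mean of each feature per class
class_means = df.groupby("diagnosis").mean()
# Relative gap between class means, largest first; bigger gaps suggest more discriminative features
gap = ((class_means.loc["M"] - class_means.loc["B"]) / class_means.loc["B"]).sort_values(ascending=False)
print(gap.head(10))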
df2
VAR1 VAR2 VAR3 VAR4 VAR5 VAR6 CLASS
0 0.624731 0.135424 0.000000 0.675282 0.182203 0.962960 Class_1
1 0.647223 0.136211 0.000000 0.679511 0.195903 0.987387 Class_1
2 0.706352 0.187593 0.000000 0.632989 0.244884 0.991182 Class_1
3 0.680291 0.192076 0.000000 0.651786 0.233528 0.991857 Class_1
4 0.660104 0.161131 0.000000 0.677162 0.209531 0.991066 Class_1
... ... ... ... ... ... ... ...
495 0.712586 0.219776 0.510939 0.593045 0.268087 0.092055 Class_4
496 0.686058 0.224004 0.518661 0.600564 0.253298 0.093827 Class_4
497 0.698661 0.216604 0.505791 0.591165 0.241696 0.090734 Class_4
498 0.714926 0.222613 0.562420 0.587406 0.271037 0.093245 Class_4
499 0.698690 0.219577 0.541828 0.583647 0.258280 0.091973 Class_4

500 rows × 7 columns

# Histogram of each feature, to inspect the distributions and spot anomalies:
# out-of-range values, sparsely populated features, or even null features
import matplotlib.pyplot as plt
df2.hist(figsize=(10, 10), color='red')
plt.show()
[Figure: histograms of the df2 features]
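The histograms make anomalies visible by eye; a small programmatic check along the same lines (a sketch over the df2 columns defined above) flags features with many values piled at exactly zero, as VAR3 shows in the first rows of df2:

# Fraction of exact zeros per feature; a high fraction can signal
# truncation, a sensor floor, or a sparsely informative feature
numeric = df2.drop(columns=["CLASS"])
print((numeric == 0).mean().sort_values(ascending=False))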
import seaborn as sns
# Correlation matrix
df3 = df2.drop(['CLASS'], axis=1)
#df3['VAR7']=2*df2['VAR1']
corr = df3.corr()

# Heatmap of the correlation matrix
plt.figure(figsize=(5, 5))
sns.heatmap(corr, fmt='.1f', annot=True, cmap='Reds')
plt.show()
[Figure: heatmap of the correlation matrix]
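Highly correlated feature pairs are candidates for removal, since one of the pair carries little extra information. A sketch that lists pairs above a threshold (0.9 is an arbitrary cutoff) using the corr matrix computed above:

import numpy as np

threshold = 0.9  # arbitrary cutoff for "highly correlated"
# Keep only the upper triangle (excluding the diagonal) to avoid duplicate pairs
upper = corr.abs().where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack()
print(pairs[pairs > threshold])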
# Pairplot
sns.pairplot(df2, hue="CLASS")
<seaborn.axisgrid.PairGrid at 0x7a8d9bcffe80>
[Figure: pairplot of df2 colored by CLASS]
# Boxplots of each feature per class, one subplot per feature
nrows = 2
ncols = 3

fig = plt.figure(figsize=(22, 15))
fig.subplots_adjust(hspace=0.2, wspace=0.1)

for i, var in enumerate(["VAR1", "VAR2", "VAR3", "VAR4", "VAR5", "VAR6"], start=1):
    ax = fig.add_subplot(nrows, ncols, i)
    sns.boxplot(data=df2, x="CLASS", y=var, ax=ax)
    ax.set_xlabel("CLASS")
    ax.set_ylabel(var)
    ax.set_title(f'{var} vs. CLASS')
plt.show()
[Figure: 2×3 grid of boxplots, VAR1–VAR6 vs. CLASS]
sns.catplot(data=df2, kind="bar", x="CLASS", y="VAR3")
<seaborn.axisgrid.FacetGrid at 0x7a8d93311210>
[Figure: bar plot of VAR3 by CLASS]
sns.catplot(data=df2, kind="violin", x="CLASS", y="VAR5")
<seaborn.axisgrid.FacetGrid at 0x7a8d90e36770>
[Figure: violin plot of VAR5 by CLASS]

Splitting the data into training and testing sets#

# Get the features
features = df.drop(['diagnosis'], axis=1)
features.head()
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave_points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave_points_worst symmetry_worst fractal_dimension_worst
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
# Get the labels
labels = df['diagnosis']
labels.head()
0    M
1    M
2    M
3    M
4    M
Name: diagnosis, dtype: object
features.shape,labels.shape
((569, 30), (569,))
from sklearn.model_selection import train_test_split
# Split the data: 20% for testing, 80% for training, stratified by class
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.20, random_state=1, stratify=labels)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((455, 30), (114, 30), (455,), (114,))
import numpy as np

# Verify the class counts in the training and testing sets
print("y_train labels unique:",np.unique(y_train, return_counts=True))
print("y_test labels unique: ",np.unique(y_test, return_counts=True))
y_train labels unique: (array(['B', 'M'], dtype=object), array([285, 170]))
y_test labels unique:  (array(['B', 'M'], dtype=object), array([72, 42]))
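Because train_test_split was called with stratify=labels, the class proportions should be (nearly) identical across the splits; a quick sketch that verifies it:

# Class proportions in the full set, the training split, and the test split
for name, y in [("full", labels), ("train", y_train), ("test", y_test)]:
    print(name, y.value_counts(normalize=True).round(3).to_dict())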

K-Nearest Neighbors#

from sklearn.neighbors import KNeighborsClassifier
# Create the untrained KNN model
model_KNN = KNeighborsClassifier(n_neighbors=3)
# Train the KNN model
model_KNN.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=3)
# Make predictions on the test set
y_pred = model_KNN.predict(X_test)

# Show the predictions
y_pred
array(['M', 'B', 'B', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B', 'B', 'M',
       'B', 'M', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M', 'M', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B',
       'M', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'M', 'B', 'M', 'M',
       'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'B', 'M', 'M', 'B', 'B',
       'M', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'B',
       'M', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'M', 'M', 'B',
       'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'M'], dtype=object)
y_test.to_numpy()
array(['M', 'B', 'B', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B', 'B', 'M',
       'B', 'M', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'M', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B',
       'M', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'M', 'B', 'M', 'M',
       'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B',
       'M', 'M', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'M', 'M', 'B', 'B',
       'M', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'M', 'M',
       'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M',
       'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'M'], dtype=object)
# Predict on the test set to obtain the class probabilities
y_pred_proba = model_KNN.predict_proba(X_test)[:,1]

# Show the shape of the predicted probabilities
y_pred_proba.shape
(114,)
y_pred_proba
array([0.66666667, 0.        , 0.        , 1.        , 0.        ,
       1.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.66666667, 0.        , 1.        ,
       0.        , 0.        , 0.66666667, 1.        , 0.        ,
       1.        , 0.        , 0.        , 0.66666667, 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 1.        ,
       1.        , 0.33333333, 0.        , 0.        , 0.        ,
       0.        , 0.66666667, 1.        , 1.        , 0.        ,
       1.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 0.33333333, 1.        , 0.        , 0.66666667,
       0.        , 1.        , 1.        , 0.        , 0.        ,
       1.        , 0.66666667, 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 1.        ,
       0.66666667, 0.        , 0.        , 0.66666667, 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 1.        ,
       0.        , 0.        , 1.        , 1.        , 0.        ,
       0.        , 0.33333333, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.33333333, 0.        ,
       0.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.33333333, 1.        ])
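The column selected with [:, 1] corresponds to the second entry of model_KNN.classes_ (sorted alphabetically by scikit-learn, so 'M' here); confirming this is safer than assuming it:

# predict_proba columns follow classes_; ['B' 'M'] means [:, 1] is P(M)
print(model_KNN.classes_)
# With n_neighbors=3 each probability is votes/3, so only 0, 1/3, 2/3, and 1
# can appear, which explains the 0.333... and 0.666... values above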

Metrics#

from sklearn.metrics import confusion_matrix
# Confusion matrix
confusion_matrix(y_test, y_pred)
array([[69,  3],
       [ 5, 37]])
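With the alphabetical label order ['B', 'M'], rows are true classes and columns predicted classes, so the matrix reads [[TN, FP], [FN, TP]] when 'M' is taken as the positive class. A sketch unpacking it into named counts:

# ravel() flattens row-major: tn=69, fp=3, fn=5, tp=37 for the matrix above
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=['B', 'M']).ravel()
print(f"sensitivity (recall for M): {tp / (tp + fn):.4f}")  # 37/42
print(f"specificity (recall for B): {tn / (tn + fp):.4f}")  # 69/72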
from sklearn.metrics import accuracy_score
# Accuracy
accuracy_s = accuracy_score(y_test,y_pred)
print('accuracy_score: {0:.4f}'.format(accuracy_s))
accuracy_score: 0.9298

Prettier metrics#

# Confusion matrix
from yellowbrick.classifier import confusion_matrix as cm
model = KNeighborsClassifier(n_neighbors=3)
visualizer_cm = cm(model, X_train, y_train, X_test, y_test)
[Figure: yellowbrick confusion matrix]
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(list(np.unique(np.array(y_train)))) #['B', 'M']
y_train_coded = le.transform(y_train)
y_test_coded = le.transform(y_test)

# Classification report
model = KNeighborsClassifier(n_neighbors=3)

from yellowbrick.classifier import classification_report as cr
visualizer_cr = cr(model, X_train, y_train_coded, X_test, y_test_coded, classes=list(np.unique(np.array(y_train))), support=True)
[Figure: yellowbrick classification report with precision, recall, F1, and support]
# Class prediction error
from yellowbrick.classifier import class_prediction_error
model = KNeighborsClassifier(n_neighbors=3)
visualizer_pe = class_prediction_error(model,X_train, y_train, X_test, y_test)
[Figure: yellowbrick class prediction error]

KNN with feature preprocessing#

Comparing unscaled and scaled features#

from sklearn import preprocessing
# Create the untrained KNN model
model_KNN = KNeighborsClassifier(n_neighbors=3)
# Train the KNN model on the raw features
model_KNN.fit(X_train, y_train)
# Report the achieved score
print(model_KNN.score(X_test, y_test))
# Define the preprocessing
standard_scaler = preprocessing.StandardScaler()
# Preprocess the data
X_train_standard = standard_scaler.fit_transform(X_train)
X_test_standard = standard_scaler.transform(X_test)
# Create the untrained KNN model
model_KNN = KNeighborsClassifier(n_neighbors=3)
# Train the KNN model on the standardized features
model_KNN.fit(X_train_standard, y_train)
# Report the achieved score
print(model_KNN.score(X_test_standard, y_test))
# Define the preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
# Preprocess the data
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)
# Create the untrained KNN model
model_KNN = KNeighborsClassifier(n_neighbors=3)
# Train the KNN model on the min-max scaled features
model_KNN.fit(X_train_minmax, y_train)
# Report the achieved score
model_KNN.score(X_test_minmax, y_test)
0.9298245614035088
0.9649122807017544
0.9385964912280702
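Note that both scalers are fitted on the training split only and merely applied to the test split, which avoids test-set leakage. A scikit-learn Pipeline makes this pattern automatic; a sketch equivalent to the StandardScaler variant above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# The pipeline fits the scaler on the training data only,
# then applies the same transformation when scoring
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))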

Class balancing#

# Balance classes by oversampling up to the majority class (random duplication)
from imblearn.over_sampling import RandomOverSampler

sampler = RandomOverSampler(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([285, 285]))
# Balance classes by oversampling up to the majority class (SMOTE)
from imblearn.over_sampling import SMOTE

sampler = SMOTE(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([285, 285]))
# Balance classes by oversampling up to the majority class (ADASYN balances only approximately)
from imblearn.over_sampling import ADASYN

sampler = ADASYN(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([285, 293]))
# Balance classes by oversampling up to the majority class (Borderline-SMOTE)
from imblearn.over_sampling import BorderlineSMOTE

sampler = BorderlineSMOTE(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([285, 285]))
# Balance classes by oversampling up to the majority class (KMeans-SMOTE)
from imblearn.over_sampling import KMeansSMOTE

sampler = KMeansSMOTE(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([285, 290]))
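The FutureWarning above comes from the KMeans clusterer that KMeansSMOTE runs internally. A sketch that silences it by passing a clusterer with n_init set explicitly through the kmeans_estimator parameter (assuming an imblearn version that accepts an estimator instance there; MiniBatchKMeans is the default internal estimator):

from sklearn.cluster import MiniBatchKMeans

# Explicit n_init on the internal clusterer suppresses the warning
sampler = KMeansSMOTE(random_state=1,
                      kmeans_estimator=MiniBatchKMeans(n_init=3, random_state=1))
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)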
# Balance classes by oversampling up to the majority class (SVM-SMOTE)
from imblearn.over_sampling import SVMSMOTE

sampler = SVMSMOTE(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([285, 285]))
# Balance classes by oversampling up to the majority class (SMOTEN, designed for categorical features)
from imblearn.over_sampling import SMOTEN

sampler = SMOTEN(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([285, 285]))
# Balance classes by undersampling down to the minority class (random removal)
from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([170, 170]))
# Balance classes by undersampling down to the minority class (cluster centroids)
from imblearn.under_sampling import ClusterCentroids

sampler = ClusterCentroids(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([170, 170]))
# Balance classes by undersampling down to the minority class (NearMiss)
from imblearn.under_sampling import NearMiss
sampler = NearMiss()
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

print("y_test  original: ",np.unique(y_test, return_counts=True))
print("y_train original: ",np.unique(y_train, return_counts=True))
print("y_train balanced: ",np.unique(y_train_balanced, return_counts=True))
y_test  original:  (array(['B', 'M'], dtype=object), array([72, 42]))
y_train original:  (array(['B', 'M'], dtype=object), array([285, 170]))
y_train balanced:  (array(['B', 'M'], dtype=object), array([170, 170]))
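The balanced training sets above are produced but never used for training. A closing sketch that retrains KNN on one of them (the RandomOverSampler output, recreated here) and compares it against the unbalanced baseline on the untouched test set:

from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier

sampler = RandomOverSampler(random_state=1)
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train, y_train)

# Same model and hyperparameters, trained on the oversampled data
model_balanced = KNeighborsClassifier(n_neighbors=3)
model_balanced.fit(X_train_balanced, y_train_balanced)
print("balanced training score:  ", model_balanced.score(X_test, y_test))

# Unbalanced baseline for comparison
model_baseline = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print("unbalanced training score:", model_baseline.score(X_test, y_test))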