Coloring data sets with two colors in Spyder(Python 3.9)

November 29, 2022

I have a dataset with 569 data points, each associated with two features and labelled either as 0 0r 1. Based on the label, I want to make a scatterplot graph such that data point associated with label 0 gets a green dot while the other with label 1 gets red dot on the scatterplot.I am using spyder(python 3.9).

Here is my code :
`

from sklearn.datasets import load_breast_cancer
breast=load_breast_cancer()
#print(breast)
breast_data=breast.data
#print(breast_data)
print(breast_data.shape)
breast_labels = breast.target
#print(breast_labels)
print(breast_labels.shape)
import numpy as np
labels=np.reshape(breast_labels,(569,1))
final_breast_data=np.concatenate([breast_data,labels],axis=1)
#print(final_breast_data)
print(final_breast_data.shape)

import  pandas as pd
breast_dataset=pd.DataFrame(final_breast_data)
#print(breast_dataset)
features=breast.feature_names
print(features)
features_labels=np.append(features,'label')
#print(features_labels)

breast_dataset.columns=features_labels
print(breast_dataset.head())
breast_dataset['label'].replace(0,'Benign',inplace=True)
breast_dataset['label'].replace(1,'Malignant',inplace=True)
print(breast_dataset.tail())


from sklearn.preprocessing import StandardScaler
x=breast_dataset.loc[:,features].values
x=StandardScaler().fit_transform(x)
print(x.shape)

print(np.mean(x),np.std(x))
feat_cols=['feature'+str(i) for i in range(x.shape[1]) ]
print(feat_cols)
#print(x.shape[0])
normalised_breast=pd.DataFrame(x,columns=feat_cols)
print(normalised_breast.tail())

from sklearn.decomposition import PCA
pca_breast=PCA(n_components=2)
#print(pca_breast)
principalComponents_breast=pca_breast.fit_transform(x)
#print(principalComponents_breast)
principal_breast_Df=pd.DataFrame(data=principalComponents_breast,columns=['Principal component 1','Principal component 2'])
print(principal_breast_Df.tail())
print('Explained variation per principal component:{}'.format(pca_breast.explained_variance_ratio_))

import pandas as pd 
import matplotlib.pyplot as plt
#x=principal_breast_Df['Principal component 1']
#y=principal_breast_Df['Principal component 2']
#plt.scatter(x,y)
#plt.show()
 
   
plt.figure()
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component 1',fontsize=20)
plt.ylabel('Principal Component 2',fontsize=20)
plt.title("Principal Component Analysis of Breast Cancer Dataset",fontsize=20)
targets=['Benign','Malignant']
colors=['r','g']


for target,color in zip(targets,colors):
    indicesToKeep=breast_dataset['label']==target
    plt.scatter(principal_breast_Df.loc[indicesToKeep,'Principal componenet 1'],principal_breast_Df.loc[indicesToKeep,'Principal component 2'],c=color,s=50)
    plt.legend(targets,prop={'size':15});

I tried to print the graph as mentioned in the problem but got a blank graph with no dots ! I think that I am missing some packages that I should include. Any help would be appreciated.

>Solution :

Change the last part to:

for target,color in zip(targets,colors):
    indicesToKeep=breast_dataset['label']==target
    plt.scatter(principal_breast_Df[breast_dataset['label']==target]['Principal component 1'], principal_breast_Df[breast_dataset['label']==target]['Principal component 2'],c=color,s=50)
    plt.legend(targets,prop={'size':15});