Welcome back!
In the last two workshops, we talked about Python basics, and now we are finally going to use it for data analysis! (Yeah!!!)
Please feel free to refer to material from the last 2 workshops:
Workshop I : Intro to Python
Workshop II : Intro to Data Structures
Workshop Best Practices
Data Cleaning / Data Manipulation:
Data Visualization:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# directly shows graph without plt.show()
Basic Data Types | Example | Meaning/ Usage |
---|---|---|
Integer (int) | 1, 2, 3 | Integer numbers |
String (str) | "apple" | Text |
Float (float) | 5.55 | Floating/ decimal point numbers |
Boolean (bool) | True, False | True/False (Boolean) values |
Collection Data Types | Example | Properties |
---|---|---|
List (list) | ["apple", "orange", 5] | Changeable, Ordered, Duplicate allowed |
Tuple (tuple) | ("apple", "orange", 5) | Unchangeable, Ordered, Duplicate allowed |
Set (set) | {"apple", "orange", 5} | Changeable (items can be added/removed), Unordered, No duplicates |
Dictionary (dict) | {"apple": 5, "orange": 10} | Key:Value pairs, Changeable, Ordered (Python 3.7+), No duplicate keys |
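As a quick refresher on the Changeable/Unchangeable column above, here is a small sketch you can run (the variable names and values are just made up for illustration):
# lists and dictionaries are changeable (mutable)
my_list = ["apple", "orange", 5]
my_list[0] = "banana"            # works: lists can be modified in place
my_dict = {"apple": 5, "orange": 10}
my_dict["apple"] = 7             # works: dictionary values can be updated
my_tuple = ("apple", "orange", 5)
# my_tuple[0] = "banana"         # would raise a TypeError: tuples are unchangeable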
Array module (documentation)
import array as arr
# first arg: the type code; 'i' specifies that the array holds (signed) integers
array_1 = arr.array('i',[3, 6, 9, 12])
NumPy package (documentation | what we'll use today)
import numpy as np
array_2 = np.array(['one', 2, 3, 4])
NumPy is a Python library that provides a multidimensional array object, various derived objects (such as masked arrays and matrices), and an assortment of routines for fast operations on arrays, including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic statistical operations, random simulation and much more.
NumPy's array class is called ndarray; it is the main object in the NumPy library.
It is a table of elements (usually numbers), all of the same type, indexed by a tuple of non-negative integers.
Again, numpy.array (shorthand: np.array) is not the same as the Standard Python Library class array.array (shorthand: arr.array), which only handles one-dimensional arrays and offers less functionality.
a = np.array(['one', 2, 3, 4])
print(type(a))
<class 'numpy.ndarray'>
There are several ways to create arrays.
For example, you can create an array from a regular Python list or tuple using the np.array function. The type of the resulting array is deduced from the type of the elements in the sequences.
Below are some other important attributes of an ndarray object: ndarray.ndim (the number of axes), ndarray.shape (the dimensions), ndarray.size (the total number of elements), and ndarray.dtype (the data type of the elements).
We can only see/perceive up to 3 dimensions, but NumPy ndarrays allow us to work with data in many more dimensions.
1-D Arrays
# homogeneous
a = np.array([2, 3, 4])
print(a)
print("Number of axes (dimensions) of a: ", a.ndim)
print("Dimensionlity of a (if there is only 1 row, the output will ignore it): ", a.shape)
print("Number of elements in a: ", a.size)
print("Data type in a: ", a.dtype)
[2 3 4] Number of axes (dimensions) of a: 1 Dimensionality of a (a 1-D array has a one-element shape tuple): (3,) Number of elements in a: 3 Data type in a: int64
# mixed
b = np.array([1.2, 3.5, "blue"])
print("Number of axes (dimensions) of b: ", b.ndim)
print("Dimensionlity of b (if there is only 1 row, the output will ignore it): ", b.shape)
print("Number of elements in b: ", b.size)
print("Data types in b: ", b.dtype) # mixed data type in an array
Number of axes (dimensions) of b: 1 Dimensionality of b (a 1-D array has a one-element shape tuple): (3,) Number of elements in b: 3 Data types in b: <U32
Read more about what <U32 (a Unicode string of at most 32 characters) means here.
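If you would rather keep the mixed Python types than have everything cast to strings, one option (just a sketch; b_obj is a made-up name) is to pass dtype = object, the same trick we will use for 2-D arrays below:
# keep the original Python objects instead of casting everything to <U32 strings
b_obj = np.array([1.2, 3.5, "blue"], dtype = object)
print(b_obj.dtype)    # object
print(b_obj[0] * 2)   # 2.4 -- the float stays a float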
2-D Arrays
# homogeneous
c = np.array([(1.5, 2, 3),
(4, 5, 6)])
print(c)
print("Number of axes (dimensions) of c: ", c.ndim)
print("Dimensionlity of c: ", c.shape)
print("Number of elements in c: ", c.size)
print("Data type in c: ", c.dtype)
[[1.5 2. 3. ] [4. 5. 6. ]] Number of axes (dimensions) of c: 2 Dimensionality of c: (2, 3) Number of elements in c: 6 Data type in c: float64
# specify the type to complex number
d = np.array([[1, 2],
[3, 4]],
dtype = complex)
print(d)
print("Number of axes (dimensions) of c: ", d.ndim)
print("Dimensionlity of c (row x col): ", d.shape)
[[1.+0.j 2.+0.j] [3.+0.j 4.+0.j]] Number of axes (dimensions) of d: 2 Dimensionality of d (row x col): (2, 2)
# if you want to create n-D arrays (n>1) with mixed data types
# you must specify the dtype to "object"
e = np.array([["hi",1,2,3],
["bye",4,5,6]],
dtype = object)
PRACTICE: What is the number of axes, dimensionality, and number of elements in e?
# Write your code here!
Often, the elements of an array are originally unknown, but its size is known. Hence, NumPy offers several functions to create arrays with initial placeholder content. These minimize the necessity of growing arrays, an expensive operation.
The function zeros creates an array full of zeros, the function ones creates an array full of ones, and the function empty creates an array whose initial content is random and depends on the state of the memory. By default, the dtype of the created array is float64, but it can be specified via the key word argument dtype.
np.zeros((3, 4))
array([[0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
np.ones((2, 3, 4), dtype = np.int16)
array([[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]], dtype=int16)
np.empty((2, 3))
array([[0.e+000, 0.e+000, 5.e-324], [0.e+000, 0.e+000, 5.e-324]])
To create sequences of numbers, NumPy provides the arange function which is analogous to the Python built-in range, but returns an array.
# np.arange(start, end, increment)
# the end value is not included
np.arange(10, 31, 5)
array([10, 15, 20, 25, 30])
np.arange(0, 2, 0.3) # it accepts float arguments
array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
PRACTICE: Create an array that starts at 0 and ends at 100 (inclusive), incrementing by 10
# (you should end up with the output below)
array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
Arithmetic operators on arrays apply elementwise. A new array is created and filled with the result.
a = np.array([20, 30, 40, 50])
print(a)
b = np.arange(4)
print(b)
c = a - b
print(c)
[20 30 40 50] [0 1 2 3] [20 29 38 47]
# squaring
b = np.arange(4)
print(b)
b**2
[0 1 2 3]
array([0, 1, 4, 9])
# the sine function, applied elementwise
10 * np.sin(a)
array([ 9.12945251, -9.88031624, 7.4511316 , -2.62374854])
# run a logical statement through an array
a < 35
array([ True, True, False, False])
Now that we have learned about NumPy ndarrays, let's pivot to Pandas DataFrames, which build on ndarrays.
In this task, we are going to work with a dataset of cars.
Datasets are commonly stored as CSV files or Excel files.
3 Ways to Load CSV files into Colab: https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92
Step 1: Import the library, authenticate, and create the interface to csv files.
# Mount your Google Drive in the Colab runtime
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
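Once Drive is mounted, an alternative to the PyDrive steps below is to read the csv straight from the mounted folder with pd.read_csv. The path here is hypothetical, so adjust it to wherever your file actually lives:
# read the csv directly from the mounted Drive (hypothetical path -- change it to your own)
# cars = pd.read_csv('/content/drive/MyDrive/data/cars.csv')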
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Connect to Google Cloud SDK
# Tools and libraries for interacting with Google Cloud products and services.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
Step 2: Import the data from csv file as a Pandas DataFrame object
*When working with your own csv file stored on Google Drive, you simply need to get the shareable link and substitute it below.
**Also, replace 'cars.csv' with the file name of your choice.
# The shareable link to our csv file,
# "https://drive.google.com/file/d/1PTzHOzz_ecK0aeyn2ZyCBG-S8IyrqUsr/view?usp=sharing"
id = "1PTzHOzz_ecK0aeyn2ZyCBG-S8IyrqUsr"
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('cars.csv')
cars = pd.read_csv('cars.csv') # --> reads the csv, turns it into Pandas DataFrame
# for full DataFrame
# cars
# for first 5 rows in the DataFrame
cars.head()
YEAR | Make | Model | Size | (kW) | Unnamed: 5 | TYPE | CITY (kWh/100 km) | HWY (kWh/100 km) | COMB (kWh/100 km) | CITY (Le/100 km) | HWY (Le/100 km) | COMB (Le/100 km) | (g/km) | RATING | (km) | TIME (h) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
1 | 2012 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
2 | 2013 | FORD | FOCUS ELECTRIC | COMPACT | 107 | A1 | B | 19.0 | 21.1 | 20.0 | 2.1 | 2.4 | 2.2 | 0 | NaN | 122 | 4 |
3 | 2013 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
4 | 2013 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
type(cars)
pandas.core.frame.DataFrame
Pandas is a library that builds on NumPy, providing more functionality, especially for data science.
Pandas DataFrames (documentation) are two-dimensional, size-mutable, potentially heterogeneous tabular data.
data: ndarray (structured or homogeneous), Iterable, dict, or DataFrame
Dict can contain Series, arrays, constants, dataclass or list-like objects. If data is a dict, column order follows insertion-order.
index: Index or array-like
Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided.
columns: Index or array-like
Column labels to use for resulting frame when data does not have them, defaulting to RangeIndex(0, 1, 2, …, n). If data contains column labels, will perform column selection instead.
dtype: dtype, default None Data type to force. Only a single dtype is allowed. If None, infer.
copy: bool or None, default None Copy data from inputs. For dict data, the default of None behaves like copy=True. For DataFrame or 2d ndarray input, the default of None behaves like copy=False.
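For example, here is a minimal sketch (with made-up fruit data) that combines the data, columns (via dict keys), and index parameters:
# dict keys become the column labels; index supplies the row labels
fruit_df = pd.DataFrame(
    {"Fruit": ["Apple", "Pear", "Berry"],
     "Cost": [1, 2, 3]},
    index = ["a", "b", "c"])
print(fruit_df)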
array1 = np.array([["Fruit","Cost","Number of Items"],
["Apple",1,5],
["Pear",2,6],
["Berry",3,7]])
print(array1)
[['Fruit' 'Cost' 'Number of Items'] ['Apple' '1' '5'] ['Pear' '2' '6'] ['Berry' '3' '7']]
# make array into table-like dataframe with the pd.DataFrame() constructor
df = pd.DataFrame(array1)
print(df)
0 1 2 0 Fruit Cost Number of Items 1 Apple 1 5 2 Pear 2 6 3 Berry 3 7
# change first row into column name
df.columns = df.iloc[0]
# drop the first row, so it won't appear twice
df = df[1:]
df
Fruit | Cost | Number of Items | |
---|---|---|---|
1 | Apple | 1 | 5 |
2 | Pear | 2 | 6 |
3 | Berry | 3 | 7 |
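Alternatively, you can skip the promote-the-first-row-to-header step above by passing the column names straight to the constructor; a sketch with the same fruit data:
# keep the header out of the data and pass it as the columns argument instead
array2 = np.array([["Apple", 1, 5],
                   ["Pear", 2, 6],
                   ["Berry", 3, 7]])
df2 = pd.DataFrame(array2, columns = ["Fruit", "Cost", "Number of Items"])
print(df2)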
To refresh your memory, our dataset has been transformed from a csv file into a pd.DataFrame, and its variable name is cars.
Now, let's take a look at the dataset!
# .head() lets you look at the first 5 rows of the data by default
# but you can also pass in an argument to change the number of rows you want to view
cars.head()
YEAR | Make | Model | Size | (kW) | Unnamed: 5 | TYPE | CITY (kWh/100 km) | HWY (kWh/100 km) | COMB (kWh/100 km) | CITY (Le/100 km) | HWY (Le/100 km) | COMB (Le/100 km) | (g/km) | RATING | (km) | TIME (h) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
1 | 2012 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
2 | 2013 | FORD | FOCUS ELECTRIC | COMPACT | 107 | A1 | B | 19.0 | 21.1 | 20.0 | 2.1 | 2.4 | 2.2 | 0 | NaN | 122 | 4 |
3 | 2013 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
4 | 2013 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
Column names are ideally descriptive.
Column names are case sensitive.
# 1. Rename column "Make" into "Brands" for more clarity
cars = cars.rename({"Make":'Brands'}, axis=1)
# axis = 1 refers to the column names, whereas axis = 0 refers to the row indices
# 2. Unify all column names to capitalized words
cars.columns = cars.columns.str.capitalize()
cars.head()
Year | Brands | Model | Size | (kw) | Unnamed: 5 | Type | City (kwh/100 km) | Hwy (kwh/100 km) | Comb (kwh/100 km) | City (le/100 km) | Hwy (le/100 km) | Comb (le/100 km) | (g/km) | Rating | (km) | Time (h) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
1 | 2012 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
2 | 2013 | FORD | FOCUS ELECTRIC | COMPACT | 107 | A1 | B | 19.0 | 21.1 | 20.0 | 2.1 | 2.4 | 2.2 | 0 | NaN | 122 | 4 |
3 | 2013 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
4 | 2013 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
Carefully check the columns and rows. Understand what each column/row stands for. Each dataframe has several attributes. Exploring some of the attributes can help us quickly get to know the data. Here, the attributes are simply the properties of an object.
Some useful attributes
Attribute | Explanation |
---|---|
DataFrame.shape | Return a tuple representing the dimensionality of the DataFrame. |
DataFrame.columns | returns the column names of the data |
DataFrame.dtypes | returns the data type of each column |
When accessing an attribute of a dataframe, simply replace DataFrame with the name of your own DataFrame.
The process is very similar to getting attributes of NumPy arrays.
cars.columns = cars.columns.str.capitalize()
#Tuple: (Rows, Columns)
cars.shape
(53, 17)
cars.columns
Index(['Year', 'Brands', 'Model', 'Size', '(kw)', 'Unnamed: 5', 'Type', 'City (kwh/100 km)', 'Hwy (kwh/100 km)', 'Comb (kwh/100 km)', 'City (le/100 km)', 'Hwy (le/100 km)', 'Comb (le/100 km)', '(g/km)', 'Rating', '(km)', 'Time (h)'], dtype='object')
cars.dtypes
Year int64 Make object Model object Size object (kw) int64 Unnamed: 5 object Type object City (kwh/100 km) float64 Hwy (kwh/100 km) float64 Comb (kwh/100 km) float64 City (le/100 km) float64 Hwy (le/100 km) float64 Comb (le/100 km) float64 (g/km) int64 Rating float64 (km) int64 Time (h) int64 dtype: object
The following table shows the common Pandas dtypes and their corresponding Python data types more clearly:
Pandas dtype | Corresponding Python type | Usage |
---|---|---|
object | str | Text |
int64 | int | Integer numbers |
float64 | float | Floating point numbers |
bool | bool | True/False (Boolean) values |
datetime64 | NA | Date and time values |
category | NA | Finite list of text values |
int64 indicates that data in the column are integer numbers
float64 indicates that data in the column are floating point numbers
object indicates that data in the column are strings
datetime64 is a date-and-time value, which is not a default Python data type, but can be useful in analytical work
Each row and column in the dataframe has its own index and label
Index (Numeral)
(1) Indexing of a row
Each row has its unique index, starting from zero.
(2) Indexing of a column
Each column has its unique index, starting from zero as well.
Label
(1) Label of a row
Each row has its unique label. It is given in the left-most column in bold font. Usually, it is the same as the index.
The DataFrame.index attribute returns the labels of the rows.
(2) Label of a column
Each column has its unique label. It is given in the first row of the dataframe.
The DataFrame.columns attribute returns the labels of the columns.
#print(cars)
print(cars.index) # index for the rows, numerals
print(cars.columns) # index for the columns, which are labels in text
RangeIndex(start=0, stop=53, step=1) Index(['Year', 'Make', 'Model', 'Size', '(kw)', 'Unnamed: 5', 'Type', 'City (kwh/100 km)', 'Hwy (kwh/100 km)', 'Comb (kwh/100 km)', 'City (le/100 km)', 'Hwy (le/100 km)', 'Comb (le/100 km)', '(g/km)', 'Rating', '(km)', 'Time (h)'], dtype='object')
We might be interested in keeping only certain columns.
The direct way is to use
DataFrame[["column1","column3",...]]
where "column1", "column3", etc are the labels of the columns you want to select.
Note: If you use single brackets to get a column, you will get a Pandas Series. A Pandas Series is a one-dimensional data structure in Pandas, very similar to the lists/arrays we have seen before, except that each element has its own label.
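For example, compare selecting the Year column with single vs. double brackets:
# single brackets --> a one-dimensional Pandas Series
print(type(cars["Year"]))
# double brackets --> a DataFrame with a single column
print(type(cars[["Year"]]))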
# let's get ONLY the 'Year' and 'Brands' information
cars[['Year','Brands']]
Year | Brands | |
---|---|---|
0 | 2012 | MITSUBISHI |
1 | 2012 | NISSAN |
2 | 2013 | FORD |
3 | 2013 | MITSUBISHI |
4 | 2013 | NISSAN |
5 | 2013 | SMART |
6 | 2013 | SMART |
7 | 2013 | TESLA |
8 | 2013 | TESLA |
9 | 2013 | TESLA |
10 | 2013 | TESLA |
11 | 2014 | CHEVROLET |
12 | 2014 | FORD |
13 | 2014 | MITSUBISHI |
14 | 2014 | NISSAN |
15 | 2014 | SMART |
16 | 2014 | SMART |
17 | 2014 | TESLA |
18 | 2014 | TESLA |
19 | 2014 | TESLA |
20 | 2015 | BMW |
21 | 2015 | CHEVROLET |
22 | 2015 | FORD |
23 | 2015 | KIA |
24 | 2015 | MITSUBISHI |
25 | 2015 | NISSAN |
26 | 2015 | SMART |
27 | 2015 | SMART |
28 | 2015 | TESLA |
29 | 2015 | TESLA |
30 | 2015 | TESLA |
31 | 2015 | TESLA |
32 | 2015 | TESLA |
33 | 2015 | TESLA |
34 | 2016 | BMW |
35 | 2016 | CHEVROLET |
36 | 2016 | FORD |
37 | 2016 | KIA |
38 | 2016 | MITSUBISHI |
39 | 2016 | NISSAN |
40 | 2016 | NISSAN |
41 | 2016 | SMART |
42 | 2016 | SMART |
43 | 2016 | TESLA |
44 | 2016 | TESLA |
45 | 2016 | TESLA |
46 | 2016 | TESLA |
47 | 2016 | TESLA |
48 | 2016 | TESLA |
49 | 2016 | TESLA |
50 | 2016 | TESLA |
51 | 2016 | TESLA |
52 | 2016 | TESLA |
PRACTICE: Get only the "Brands", "Size", and "Rating" information
# you should get the output below
Brands | Size | Rating | |
---|---|---|---|
0 | MITSUBISHI | SUBCOMPACT | NaN |
1 | NISSAN | MID-SIZE | NaN |
2 | FORD | COMPACT | NaN |
3 | MITSUBISHI | SUBCOMPACT | NaN |
4 | NISSAN | MID-SIZE | NaN |
5 | SMART | TWO-SEATER | NaN |
6 | SMART | TWO-SEATER | NaN |
7 | TESLA | FULL-SIZE | NaN |
8 | TESLA | FULL-SIZE | NaN |
9 | TESLA | FULL-SIZE | NaN |
10 | TESLA | FULL-SIZE | NaN |
11 | CHEVROLET | SUBCOMPACT | NaN |
12 | FORD | COMPACT | NaN |
13 | MITSUBISHI | SUBCOMPACT | NaN |
14 | NISSAN | MID-SIZE | NaN |
15 | SMART | TWO-SEATER | NaN |
16 | SMART | TWO-SEATER | NaN |
17 | TESLA | FULL-SIZE | NaN |
18 | TESLA | FULL-SIZE | NaN |
19 | TESLA | FULL-SIZE | NaN |
20 | BMW | SUBCOMPACT | NaN |
21 | CHEVROLET | SUBCOMPACT | NaN |
22 | FORD | COMPACT | NaN |
23 | KIA | STATION WAGON - SMALL | NaN |
24 | MITSUBISHI | SUBCOMPACT | NaN |
25 | NISSAN | MID-SIZE | NaN |
26 | SMART | TWO-SEATER | NaN |
27 | SMART | TWO-SEATER | NaN |
28 | TESLA | FULL-SIZE | NaN |
29 | TESLA | FULL-SIZE | NaN |
30 | TESLA | FULL-SIZE | NaN |
31 | TESLA | FULL-SIZE | NaN |
32 | TESLA | FULL-SIZE | NaN |
33 | TESLA | FULL-SIZE | NaN |
34 | BMW | SUBCOMPACT | 10.0 |
35 | CHEVROLET | SUBCOMPACT | 10.0 |
36 | FORD | COMPACT | 10.0 |
37 | KIA | STATION WAGON - SMALL | 10.0 |
38 | MITSUBISHI | SUBCOMPACT | 10.0 |
39 | NISSAN | MID-SIZE | 10.0 |
40 | NISSAN | MID-SIZE | 10.0 |
41 | SMART | TWO-SEATER | 10.0 |
42 | SMART | TWO-SEATER | 10.0 |
43 | TESLA | FULL-SIZE | 10.0 |
44 | TESLA | FULL-SIZE | 10.0 |
45 | TESLA | FULL-SIZE | 10.0 |
46 | TESLA | FULL-SIZE | 10.0 |
47 | TESLA | FULL-SIZE | 10.0 |
48 | TESLA | FULL-SIZE | 10.0 |
49 | TESLA | FULL-SIZE | 10.0 |
50 | TESLA | FULL-SIZE | 10.0 |
51 | TESLA | SUV - STANDARD | 10.0 |
52 | TESLA | SUV - STANDARD | 10.0 |
Remember our old friend Index?
You can also select certain columns (and rows) in the dataframe by position using the .iloc indexer
.iloc is indexed with two position-based selections: [rows, columns]
# all rows, from columns 1 to 3
selected_df = DataFrame.iloc[: , 1:4]
# all rows, only column 1 and 3
selected_df = DataFrame.iloc[: , [1,3]]
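For instance, on our cars DataFrame, a quick sketch that selects the first five rows of columns 1 through 3 by position:
# first 5 rows (positions 0-4), columns at positions 1, 2, and 3
cars.iloc[:5, 1:4]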
PRACTICE: Get a dataframe that includes the first column and the last column
Data cleaning can be a tedious task, and analysts spend a large share of their time preparing data. The often-quoted 80/20 rule says an analyst spends around 80% of the time cleaning data before even moving on to analytics!
Some of the important tasks related to data cleaning include renaming columns, dropping uninformative columns, and handling missing values.
Pandas provides many powerful methods to help us perform data cleaning very efficiently.
For a quick reference, click here for a nice cheatsheet.
Let's illustrate some of the basic tasks using this sample data.
cars.head()
Year | Brands | Model | Size | (kw) | Unnamed: 5 | Type | City (kwh/100 km) | Hwy (kwh/100 km) | Comb (kwh/100 km) | City (le/100 km) | Hwy (le/100 km) | Comb (le/100 km) | (g/km) | Rating | (km) | Time (h) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
1 | 2012 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
2 | 2013 | FORD | FOCUS ELECTRIC | COMPACT | 107 | A1 | B | 19.0 | 21.1 | 20.0 | 2.1 | 2.4 | 2.2 | 0 | NaN | 122 | 4 |
3 | 2013 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | A1 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
4 | 2013 | NISSAN | LEAF | MID-SIZE | 80 | A1 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
# .nunique() --> Check the number of unique values in "Unnamed: 5"
cars["Unnamed: 5"].nunique()
# .unique() --> Get the unique value(s) in "Unnamed: 5"
cars["Unnamed: 5"].unique()
array(['A1'], dtype=object)
# .value_counts() --> Count number of rows with each unique value of variable
print(cars["Unnamed: 5"].value_counts())
A1 53 Name: Unnamed: 5, dtype: int64
# See the number of unique values in each column for the whole dataframe
print(cars.nunique())
# when the sample size is small,
# it is very likely that the dataset does not have all the categories of a specific column.
Year 5 Make 8 Model 23 Size 7 (kw) 15 Unnamed: 5 1 Type 1 City (kwh/100 km) 19 Hwy (kwh/100 km) 19 Comb (kwh/100 km) 19 City (le/100 km) 10 Hwy (le/100 km) 6 Comb (le/100 km) 8 (g/km) 1 Rating 1 (km) 19 Time (h) 7 dtype: int64
cars.columns
Index(['Year', 'Make', 'Model', 'Size', '(kw)', 'Unnamed: 5', 'Type', 'City (kwh/100 km)', 'Hwy (kwh/100 km)', 'Comb (kwh/100 km)', 'City (le/100 km)', 'Hwy (le/100 km)', 'Comb (le/100 km)', '(g/km)', 'Rating', '(km)', 'Time (h)'], dtype='object')
# Suppose we decided to drop the column "Unnamed: 5"
# because it only contains one unique value
cars.drop(columns = ["Unnamed: 5"], inplace = True)
# inplace = True means that we directly make changes to the specified object, without creating new objects
cars.columns
Index(['Year', 'Brands', 'Model', 'Size', '(kw)', 'Type', 'City (kwh/100 km)', 'Hwy (kwh/100 km)', 'Comb (kwh/100 km)', 'City (le/100 km)', 'Hwy (le/100 km)', 'Comb (le/100 km)', '(g/km)', 'Rating', '(km)', 'Time (h)'], dtype='object')
Pandas stores missing values in a DataFrame as NumPy NaN values.
# previously, we have 17 columns
# and the shape of our data frame was 53 rows X 17 columns
# use the .shape attribute to see the dimensionality now
cars.shape
(53, 16)
One thing we immediately notice is that there are missing values in the dataset.
Pandas provides many methods we can use to work with missing values. A good tutorial can be found here.
Some commonly used methods are listed below.
method | explanation |
---|---|
DataFrame.isna() | Returns a Boolean for each cell indicating whether the value is missing (True) or not (False) |
DataFrame.fillna() | Fill in missing values with a specified value or method (e.g., forward fill, backward fill, or a computed statistic such as the mean or median) |
DataFrame.interpolate() | Fill in missing values using interpolation methods |
DataFrame.dropna() | Drop missing values |
Let's give DataFrame.isna() a shot
#cars.isna()
#cars.head().isna()
How do we proceed?
Maybe we are interested in understanding how many missing values we have. This is also a good place to talk about some basic Pandas calculations.
Function | Explanation |
---|---|
DataFrame.sum() | Sum all the values column-wise; add axis=1 for row-wise. |
DataFrame.cumsum() | Perform a cumulative sum column-wise; add axis=1 for row-wise. |
DataFrame.prod() | Multiply all the values column-wise; add axis=1 for row-wise. |
DataFrame.cumprod() | Perform a cumulative product column-wise; add axis=1 for row-wise. |
cars.sum()
# not every sum makes sense
Year 106781 Brands MITSUBISHINISSANFORDMITSUBISHINISSANSMARTSMART... Model i-MiEVLEAFFOCUS ELECTRICi-MiEVLEAFFORTWO ELECT... Size SUBCOMPACTMID-SIZECOMPACTSUBCOMPACTMID-SIZETWO... (kw) 10103 Type BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB... City (kwh/100 km) 1041.3 Hwy (kwh/100 km) 1146.6 Comb (kwh/100 km) 1088.7 City (le/100 km) 117 Hwy (le/100 km) 128.4 Comb (le/100 km) 122 (g/km) 0 Rating 190 (km) 12676 Time (h) 449 dtype: object
# However, by combining isna with sum, we can get the number of missing values in each column
cars.isna().sum()
Year 0 Brands 0 Model 0 Size 0 (kw) 0 Type 0 City (kwh/100 km) 0 Hwy (kwh/100 km) 0 Comb (kwh/100 km) 0 City (le/100 km) 0 Hwy (le/100 km) 0 Comb (le/100 km) 0 (g/km) 0 Rating 34 (km) 0 Time (h) 0 dtype: int64
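Rating is the only column with missing values (34 of them). What to do with them depends on your analysis; two common options are sketched below (neither is the single right answer, and the fill value 0 is purely illustrative):
# option 1: fill the missing Rating values with a placeholder value
cars_filled = cars.fillna({"Rating": 0})
print(cars_filled["Rating"].isna().sum())   # 0 missing values left
# option 2: drop the rows whose Rating is missing
cars_dropped = cars.dropna(subset = ["Rating"])
print(cars_dropped.shape)                   # fewer rows than the original 53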
In statistics, exploratory data analysis is an approach to analyzing data sets to summarize their main characteristics, often using statistical graphics and other data visualization methods.
cars.head()
Year | Make | Model | Size | (kw) | Type | City (kwh/100 km) | Hwy (kwh/100 km) | Comb (kwh/100 km) | City (le/100 km) | Hwy (le/100 km) | Comb (le/100 km) | (g/km) | Rating | (km) | Time (h) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2012 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
1 | 2012 | NISSAN | LEAF | MID-SIZE | 80 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
2 | 2013 | FORD | FOCUS ELECTRIC | COMPACT | 107 | B | 19.0 | 21.1 | 20.0 | 2.1 | 2.4 | 2.2 | 0 | NaN | 122 | 4 |
3 | 2013 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 | B | 16.9 | 21.4 | 18.7 | 1.9 | 2.4 | 2.1 | 0 | NaN | 100 | 7 |
4 | 2013 | NISSAN | LEAF | MID-SIZE | 80 | B | 19.3 | 23.0 | 21.1 | 2.2 | 2.6 | 2.4 | 0 | NaN | 117 | 7 |
# summary statistics (for numerical values)
cars.describe()
Year | (kw) | City (kwh/100 km) | Hwy (kwh/100 km) | Comb (kwh/100 km) | City (le/100 km) | Hwy (le/100 km) | Comb (le/100 km) | (g/km) | Rating | (km) | Time (h) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 53.000000 | 53.000000 | 53.00000 | 53.000000 | 53.000000 | 53.000000 | 53.000000 | 53.000000 | 53.0 | 19.0 | 53.000000 | 53.000000 |
mean | 2014.735849 | 190.622642 | 19.64717 | 21.633962 | 20.541509 | 2.207547 | 2.422642 | 2.301887 | 0.0 | 10.0 | 239.169811 | 8.471698 |
std | 1.227113 | 155.526429 | 3.00100 | 1.245753 | 1.979455 | 0.344656 | 0.143636 | 0.212576 | 0.0 | 0.0 | 141.426352 | 2.991036 |
min | 2012.000000 | 35.000000 | 15.20000 | 18.800000 | 16.800000 | 1.700000 | 2.100000 | 1.900000 | 0.0 | 10.0 | 100.000000 | 4.000000 |
25% | 2014.000000 | 80.000000 | 17.00000 | 20.800000 | 18.700000 | 1.900000 | 2.300000 | 2.100000 | 0.0 | 10.0 | 117.000000 | 7.000000 |
50% | 2015.000000 | 107.000000 | 19.00000 | 21.700000 | 20.000000 | 2.100000 | 2.400000 | 2.200000 | 0.0 | 10.0 | 135.000000 | 8.000000 |
75% | 2016.000000 | 283.000000 | 22.40000 | 22.500000 | 22.100000 | 2.500000 | 2.500000 | 2.500000 | 0.0 | 10.0 | 402.000000 | 12.000000 |
max | 2016.000000 | 568.000000 | 23.90000 | 23.300000 | 23.600000 | 2.700000 | 2.600000 | 2.600000 | 0.0 | 10.0 | 473.000000 | 12.000000 |
Note: all the object (non-numeric) columns are excluded from the summary table.
Let's take a look at the columns one by one first.
# Year
cars["Year"].value_counts()
2016 19 2015 14 2014 9 2013 9 2012 2 Name: Year, dtype: int64
# Brands
cars["Brands"].value_counts()
TESLA 23 SMART 8 NISSAN 6 MITSUBISHI 5 FORD 4 CHEVROLET 3 BMW 2 KIA 2 Name: Brands, dtype: int64
PRACTICE: Create a function to print out value_counts() results for all the object variables
# you should get the output below
Year Brands Model Size (kw) Type City (kwh/100 km) Hwy (kwh/100 km) Comb (kwh/100 km) City (le/100 km) Hwy (le/100 km) Comb (le/100 km) (g/km) Rating (km) Time (h) 2016 TESLA MODEL X P90D SUV - STANDARD 568 B 23.6 23.3 23.5 2.7 2.6 2.6 0 10.0 402 12 1 SMART FORTWO ELECTRIC DRIVE COUPE TWO-SEATER 35 B 17.2 22.5 19.6 1.9 2.5 2.2 0 10.0 109 8 1 CHEVROLET SPARK EV SUBCOMPACT 104 B 16.0 19.6 17.8 1.8 2.2 2.0 0 10.0 131 7 1 FORD FOCUS ELECTRIC COMPACT 107 B 19.0 21.1 20.0 2.1 2.4 2.2 0 10.0 122 4 1 KIA SOUL EV STATION WAGON - SMALL 81 B 17.5 22.7 19.9 2.0 2.6 2.2 0 10.0 149 4 1 MITSUBISHI i-MiEV SUBCOMPACT 49 B 16.9 21.4 18.7 1.9 2.4 2.1 0 10.0 100 7 1 NISSAN LEAF (24 kWh battery) MID-SIZE 80 B 16.5 20.8 18.4 1.9 2.3 2.1 0 10.0 135 5 1 LEAF (30 kWh battery) MID-SIZE 80 B 17.0 20.7 18.6 1.9 2.3 2.1 0 10.0 172 6 1 SMART FORTWO ELECTRIC DRIVE CABRIOLET TWO-SEATER 35 B 17.2 22.5 19.6 1.9 2.5 2.2 0 10.0 109 8 1 TESLA MODEL S (60 kWh battery) FULL-SIZE 283 B 22.2 21.7 21.9 2.5 2.4 2.5 0 10.0 335 10 1 MODEL X 90D SUV - STANDARD 386 B 23.2 22.2 22.7 2.6 2.5 2.6 0 10.0 414 12 1 MODEL S (70 kWh battery) FULL-SIZE 283 B 23.8 23.2 23.6 2.7 2.6 2.6 0 10.0 377 12 1 MODEL S (85/90 kWh battery) FULL-SIZE 283 B 23.8 23.2 23.6 2.7 2.6 2.6 0 10.0 426 12 1 MODEL S 70D FULL-SIZE 386 B 20.8 20.6 20.7 2.3 2.3 2.3 0 10.0 386 12 1 MODEL S 85D/90D FULL-SIZE 386 B 22.0 19.8 21.0 2.5 2.2 2.4 0 10.0 435 12 1 MODEL S 90D (Refresh) FULL-SIZE 386 B 20.8 19.7 20.3 2.3 2.2 2.3 0 10.0 473 12 1 MODEL S P85D/P90D FULL-SIZE 568 B 23.4 21.5 22.5 2.6 2.4 2.5 0 10.0 407 12 1 MODEL S P90D (Refresh) FULL-SIZE 568 B 22.9 21.0 22.1 2.6 2.4 2.5 0 10.0 435 12 1 BMW i3 SUBCOMPACT 125 B 15.2 18.8 16.8 1.7 2.1 1.9 0 10.0 130 4 1 dtype: int64
matplotlib.pyplot is a collection of functions that make matplotlib work like MATLAB. Each pyplot function makes some change to a figure: e.g., creates a figure, creates a plotting area in a figure, plots some lines in a plotting area, decorates the plot with labels, etc.
Sample plots: https://matplotlib.org/stable/tutorials/introductory/sample_plots.html#sphx-glr-tutorials-introductory-sample-plots-py
list1 = [1,2,3,4]
plt.plot(list1)
plt.ylabel('some numbers')
# because we have written %matplotlib inline
# we don't need to write plt.show() every time
# but you could add it as well
plt.show()
PRACTICE: plt.plot also takes NumPy ndarrays as data. Define a NumPy ndarray and plot it!
You may be wondering why the x-axis ranges from 0 to 3 and the y-axis from 1 to 4. If you provide a single list or array to plot, matplotlib assumes it is a sequence of y values and automatically generates the x values for you. Since Python ranges start with 0, the default x vector has the same length as y but starts with 0. Hence the x data are [0, 1, 2, 3].
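If you want different x values, pass them explicitly as the first argument to plt.plot; for example:
# explicit x and y values
x = [1, 2, 3, 4]
y = [1, 4, 9, 16]
plt.plot(x, y)
plt.ylabel('some squares')
plt.show()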
# histogram
plt.hist(cars["Year"])
plt.show()
However, a histogram is most commonly used to visualize the distribution of a continuous variable.
In this case, since Year behaves like a categorical variable, a bar plot is more appropriate.
cars["Year"].value_counts()
# left column is index
# right column is values
2016 19 2015 14 2014 9 2013 9 2012 2 Name: Year, dtype: int64
# using a bar plot for categorical variables
year = cars["Year"].value_counts().index
print(year)
values = cars["Year"].value_counts().values
print(values)
Int64Index([2016, 2015, 2014, 2013, 2012], dtype='int64') [19 14 9 9 2]
# plt.bar takes two arguments: the categories and their counts
plt.bar(year, values)
plt.xlabel("Year")
plt.ylabel("Count")
Text(0, 0.5, 'Count')
fig1 = plt.pie(values)
plt.show()
# customize labels
mylabels = year
# send argument into plt.pie()
plt.pie(values, labels = mylabels)
plt.show()
Colors are customizable as well; you can specify them by name or by hex value.
# be as creative as you want, but the goal is to increase readability
# you can create a list of color names,
mycolors_names = ['lightseagreen', 'mediumpurple', 'orange', 'skyblue', 'khaki']
# or a list of their corresponding hex codes
mycolors_hex = ['#20B2AA','#9370DB','#FFA500','#87CEEB','#F0E68C']
# send argument into plt.pie()
plt.pie(values, labels = mylabels, colors = mycolors_names)
plt.show()
# add legend with plt.legend(title = "")
plt.pie(values, labels = mylabels, colors = mycolors_names)
plt.legend(title = 'Years')
<matplotlib.legend.Legend at 0x7f5e71eafc10>
More on pie chart customization: click here
# Sometimes, you want to compare different plots
# We do that by creating subplots
names = ['group_a', 'group_b', 'group_c']
values = [1, 10, 100]
plt.figure(figsize = (9, 3))
#plt.figure(figsize = (12, 3))
plt.subplot(1,3,1)
plt.bar(names, values)
plt.subplot(1,3,2)
plt.scatter(names, values)
plt.subplot(1,3,3)
plt.plot(names, values)
plt.suptitle('Categorical Plotting')
Text(0.5, 0.98, 'Categorical Plotting')
# groupby function
df = cars[["Brands", "Size", "(kw)"]].groupby(["Brands", "Size"]).mean()
df
(kw) | ||
---|---|---|
Brands | Size | |
BMW | SUBCOMPACT | 125.000000 |
CHEVROLET | SUBCOMPACT | 104.000000 |
FORD | COMPACT | 107.000000 |
KIA | STATION WAGON - SMALL | 81.000000 |
MITSUBISHI | SUBCOMPACT | 49.000000 |
NISSAN | MID-SIZE | 80.000000 |
SMART | TWO-SEATER | 35.000000 |
TESLA | FULL-SIZE | 332.952381 |
SUV - STANDARD | 477.000000 |
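If you would rather have the grouped result as a flat table (handy for plotting), one sketch is to aggregate and then reset the index; power_by_brand is just an illustrative name:
# mean and max power per brand, flattened back into ordinary columns
power_by_brand = cars.groupby("Brands")["(kw)"].agg(["mean", "max"]).reset_index()
print(power_by_brand)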
Basic Data Types | Example | Meaning/ Usage |
---|---|---|
Integer (int) | 1, 2, 3 | Integer numbers |
String (str) | "apple" | Text |
Float (float) | 5.55 | Floating/ decimal point numbers |
Boolean (bool) | True, False | True/False (Boolean) values |
Collection Data Types | Example | Properties |
---|---|---|
List (list) | ["apple", "orange", 5] | Changeable, Ordered, Duplicate allowed |
Tuple (tuple) | ("apple", "orange", 5) | Unchangeable, Ordered, Duplicate allowed |
Set (set) | {"apple", "orange", 5} | Changeable (items can be added/removed), Unordered, No duplicates |
Dictionary (dict) | {"apple": 5, "orange": 10} | Key:Value pairs, Changeable, Ordered (Python 3.7+), No duplicate keys |
Pandas/ NumPy dtype | Corresponding Python type | Usage |
---|---|---|
object | str | Text |
int64 | int | Integer numbers |
float64 | float | Floating point numbers |
bool | bool | True/False (Boolean) values |
datetime64 (only Pandas) | NA | Date and time values |
category (only Pandas) | NA | Finite list of text values |
When to use lists/ NumPy ndarrays/ Pandas DataFrame?
Explore more datasets in the drive.
Tasks can include:
If you need help with any material in this notebook, please contact NYU Shanghai Library at shanghai.library@nyu.edu
Ending credits
Tutorial framework:
https://www.w3schools.com/python/default.asp
Images:
https://towardsdatascience.com/python-list-numpy-and-pandas-3a32f1aee948
https://predictivehacks.com/tips-about-numpy-arrays/
Modified and organized by: Pamela Pan, Jie Chen