Data Warehousing and Machine Learning

29 March 2021

Plot

Filed under: Python — Vincent Rainardi @ 6:41 am
# 2x3 grid for categorical variables: box plots of "count" on the top row,
# category frequency plots on the bottom row.
categorical_columns = ["season", "month", "weekday"]
plt.figure(figsize=(18, 8))
for position, column in enumerate(categorical_columns, start=1):
    # Top row (cells 1-3): distribution of "count" within each category.
    plt.subplot(2, 3, position)
    sns.boxplot(data=df, x=column, y="count")
    # Bottom row (cells 4-6): number of rows in each category.
    # The column is passed by keyword: newer seaborn releases make `x`
    # keyword-only and treat the first positional argument as `data`, so
    # countplot(df[column]) no longer plots the per-category counts.
    plt.subplot(2, 3, position + len(categorical_columns))
    sns.countplot(data=df, x=column)
plt.show()
# Two bar plots side by side, both split by year; the right-hand plot is
# drawn without the confidence-interval line.
plt.figure(figsize=(10, 3))

month_axes = plt.subplot(1, 2, 1)
sns.barplot(x="month", y="count", hue="year", data=df, ax=month_axes)

weekday_axes = plt.subplot(1, 2, 2)
sns.barplot(x="weekday", y="count", hue="year", data=df, ci=None, ax=weekday_axes)

plt.show()
# 3x3 pair plot of continuous variables.
# pairplot is a figure-level function that creates its own figure, so calling
# plt.figure(figsize=...) beforehand only opens an extra, empty figure. To
# resize the plot, pass pairplot's own `height` / `aspect` arguments instead.
continuous_columns = ["count", "temp", "windspeed"]
sns.pairplot(df[continuous_columns])
plt.show()
# View the heat map of correlations between variables (as whole percentages)
plt.figure(figsize=(12, 7))
# Keep only numeric/boolean columns before correlating: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on them instead. select_dtypes works the same on old and new pandas.
numeric_df = df.select_dtypes(include=["number", "bool"])
correlation_data = numeric_df.corr()
# Scale to -100..100 and cast to int so the cell annotations can use fmt="d".
correlation_data = (correlation_data * 100).round(0).astype(int)
sns.heatmap(correlation_data, annot=True, fmt="d", cmap="YlGnBu")
plt.show()
# Setting box plot title, x and y axis labels, range and format
from matplotlib.ticker import FuncFormatter

ax = sns.boxplot(data=df, width=0.5, palette="Blues")
ax.set_title('Plot Title')
ax.set_xlabel("Label for x axis")
ax.set_ylabel("Label for y axis")

# Fix the y range before formatting the ticks, so the tick labels are
# produced for the final view.
ax.set_ylim(0, 80000000)  # set the upper limit of y axis

# Label y ticks in millions of dollars (e.g. 20000000 -> "$20m").
# A formatter is used instead of set_yticklabels(): fixed label strings are
# assigned by tick index, so they stop matching the tick positions as soon as
# the axis limits (and therefore the tick locations) change.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: '${:,}m'.format(int(value / 1000000)))
)
ax.grid(which='major', linestyle=':', linewidth=1, color='grey')
for item in (ax.xaxis.label, ax.yaxis.label):
    item.set_fontsize(14)
    item.set_color("blue")  # set the x and y label to blue

# Draw 2 red lines showing the preferred range
ax.axhline(5000000, ls='-', linewidth=1, color='red')
ax.axhline(15000000, ls='-', linewidth=1, color='red')

Pandas

Filed under: Python — Vincent Rainardi @ 6:29 am
# Rename columns (maps old name -> new name; the call was missing its
# closing parenthesis, which is a syntax error):
df = df.rename(columns={"col1": "column1", "col2": "column2"})

# Drop columns in place. `columns=` already identifies the axis, so the
# redundant `axis=1` (which only qualifies the `labels` argument) is omitted:
df.drop(columns=["column1", "column2"], inplace=True)

# Count of distinct values in each column:
df.nunique()

# Count of missing entries in each column (isna is the same check as isnull):
df.isna().sum()

# Convert a string column to date (day-first parsing, e.g. "29/03/2021"):
df["date_column"] = pd.to_datetime(df["string_column"], dayfirst=True)

# Get the day element from a date column:
df["day"] = df["date_column"].dt.day

# Get the weekday name (e.g. Monday, Tuesday) from a date column.
# No locale is passed: day_name() returns English names by default, whereas
# 'English' is a Windows-style locale name that typically raises
# locale.Error ("unsupported locale setting") on other platforms.
df["weekday_name"] = df["date_column"].dt.day_name()

# Row count for each (column1, column2) combination:
group_keys = ["column1", "column2"]
df.groupby(group_keys).size()

# Combine 2 data frames side by side (df2's columns are placed after df1's):
frames = [df1, df2]
df = pd.concat(frames, axis="columns")

Blog at WordPress.com.