Random Generation for Tabular Data
This is just a short note on generating random data to be used for pair plots and correlation maps. For the plot style we use the HEP style via mplhep.
Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
import seaborn as sns
import random
from scipy.stats import multivariate_normal as mvn
import mplhep as hep
hep.style.use("CMS")
import warnings
warnings.filterwarnings('ignore')
seed = 1234
np.random.seed(seed)
random.seed(seed)
plt.rcParams['savefig.facecolor'] = "0.8"
plt.rcParams.update({'font.size': 13})
# create first set
# create numbers that follow a normal distribution
feature_a = np.random.normal(500, 25, size=1000)
feature_b = np.random.normal(1023, 19, size=1000)
# uniform integers in [0, 60) and uniform random integers in [0, 40)
feature_d = np.random.uniform(low=0, high=60, size=(1000,)).astype(int)
feature_e = np.random.choice(a=40, size=1000)
# create numbers that follow a skew-normal distribution
numValues = 1000
maxValue = 100
skewness = -5  # negative values are left-skewed, positive values are right-skewed
skewed = skewnorm.rvs(a=skewness, loc=maxValue, size=numValues)  # draw from the skew-normal distribution
skewed = skewed - min(skewed)  # shift the set so the minimum value is equal to zero
skewed = skewed / max(skewed)  # standardize all the values between 0 and 1
feature_c = skewed * maxValue  # multiply the standardized values by the maximum value
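# Sanity check (an added sketch, not part of the original cell): with skewness = -5
# the sample skewness of feature_c should come out negative.
from scipy.stats import skew
print("sample skewness of feature_c:", skew(feature_c))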
# create numbers that follow a power-law distribution
feature_f = np.random.power(1.75, numValues)
Code
# prepare to create dataframe
list_1 = feature_a.tolist()
list_2 = feature_b.tolist()
list_3 = feature_c.tolist()
list_4 = feature_d.tolist()
list_5 = feature_e.tolist()
list_6 = feature_f.tolist()
data_set_1 = {'feature A': list_1, 'feature B': list_2, 'feature C': list_3,
'feature D': list_4, 'feature E': list_5, 'feature F': list_6}
df_set_1 = pd.DataFrame(data_set_1)
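# note (added): pd.DataFrame also accepts the NumPy arrays directly,
# e.g. pd.DataFrame({'feature A': feature_a, ...}), so the .tolist() step above is optional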
df_set_1.head(15)
| | feature A | feature B | feature C | feature D | feature E | feature F |
|---|---|---|---|---|---|---|
| 0 | 511.785879 | 992.938048 | 76.072909 | 38 | 22 | 0.857588 |
| 1 | 470.225608 | 992.216386 | 65.754836 | 23 | 36 | 0.948207 |
| 2 | 535.817674 | 1023.884676 | 74.986424 | 58 | 20 | 0.903021 |
| 3 | 492.183703 | 991.083252 | 58.718138 | 26 | 18 | 0.837142 |
| 4 | 481.985282 | 1049.521954 | 84.559789 | 48 | 18 | 0.653904 |
| 5 | 522.179074 | 1006.945543 | 80.584231 | 58 | 18 | 0.713595 |
| 6 | 521.489710 | 1038.466132 | 79.887988 | 35 | 19 | 0.706957 |
| 7 | 484.086912 | 1022.054896 | 77.791707 | 38 | 33 | 0.533313 |
| 8 | 500.392409 | 1033.150688 | 66.112721 | 36 | 33 | 0.787118 |
| 9 | 443.932876 | 1007.666827 | 74.728363 | 38 | 22 | 0.320662 |
| 10 | 528.750893 | 1038.991174 | 80.805396 | 49 | 7 | 0.650921 |
| 11 | 524.798651 | 1012.394720 | 74.509873 | 3 | 6 | 0.695892 |
| 12 | 523.833103 | 1011.046008 | 63.021237 | 16 | 4 | 0.322784 |
| 13 | 449.468629 | 1045.375675 | 55.890691 | 15 | 14 | 0.748238 |
| 14 | 491.648066 | 1006.943466 | 27.852004 | 59 | 12 | 0.624733 |
Some data plots for Table 1
Code
fig, ax = plt.subplots(nrows=2, ncols=2)
plt.tight_layout(pad=.95)
df_set_1.plot(x = "feature A",y="feature F", kind="scatter", color = "r", alpha = .5, ax=ax[0][0])
df_set_1.plot(x = "feature A",y="feature B", kind="scatter", ax = ax[0][0], color = "b", alpha = .2)
ax[0][0].set_ylabel("feature F and feature B");
df_set_1.plot(x = "feature B",y="feature F", kind="scatter", color = "r", alpha = .5, ax=ax[0][1])
df_set_1.plot(x = "feature B",y="feature E", kind="scatter", ax = ax[0][1], color = "b", alpha = .2)
ax[0][1].set_ylabel("feature F and feature E");
df_set_1.plot(x = "feature C",y="feature A", kind="scatter", color = "r", alpha = .5, ax=ax[1][0])
df_set_1.plot(x = "feature C",y="feature D", kind="scatter", ax = ax[1][0], color = "b", alpha = .2)
ax[1][0].set_ylabel("feature A and feature D");
df_set_1.plot(x = "feature F",y="feature A", kind="scatter", color = "r", alpha = .5, ax=ax[1][1])
df_set_1.plot(x = "feature F",y="feature C", kind="scatter", ax = ax[1][1], color = "b", alpha = .2)
ax[1][1].set_ylabel("feature A and feature C");
Code
sns.pairplot(df_set_1);
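The introduction also mentions correlation maps, so here is a minimal sketch (an addition, not in the original notebook) that draws one for the first set; it assumes pandas' DataFrame.corr (Pearson by default) and seaborn's heatmap, and the name corr_1 is just a local helper.

Code
# correlation map for the first set: compute pairwise Pearson correlations
# and draw them as an annotated heatmap
corr_1 = df_set_1.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_1, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
plt.tight_layout()
Code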
# create second set
# a covariance matrix could also be estimated from existing data, e.g.:
# data_ = [...]
# cov_matrix = np.cov(data_, bias=True)
cov = np.array([[1.0, 0.8, 0.7, 0.6],
                [0.8, 1.0, 0.5, 0.5],
                [0.7, 0.5, 1.0, 0.5],
                [0.6, 0.5, 0.5, 1.0]])
scores_ = mvn.rvs(mean = [-200.,-200.,-200.,-200.], cov=cov, size = 1000)
df_set_2 = pd.DataFrame(data = scores_, columns = ["feature E", "feature F", "feature H", "feature G"])
df_set_2.head(15)
| | feature E | feature F | feature H | feature G |
|---|---|---|---|---|
| 0 | -199.495066 | -199.908783 | -199.196850 | -199.866409 |
| 1 | -202.273674 | -202.157820 | -200.138768 | -201.427624 |
| 2 | -200.831163 | -199.377240 | -201.399875 | -201.394457 |
| 3 | -200.673144 | -200.383514 | -201.184697 | -199.566227 |
| 4 | -200.083799 | -200.251282 | -200.132608 | -199.823847 |
| 5 | -201.451594 | -200.635146 | -201.021892 | -200.708970 |
| 6 | -199.810856 | -200.173395 | -198.696014 | -199.614389 |
| 7 | -201.341426 | -201.206530 | -202.029928 | -200.957953 |
| 8 | -200.043966 | -199.170888 | -200.119988 | -200.141972 |
| 9 | -198.519039 | -199.157659 | -198.907071 | -200.315265 |
| 10 | -198.008390 | -199.410725 | -199.420994 | -199.457605 |
| 11 | -200.675343 | -200.700744 | -201.040379 | -200.743832 |
| 12 | -201.649402 | -200.300023 | -201.838069 | -200.687691 |
| 13 | -198.625462 | -198.742747 | -198.052051 | -198.507148 |
| 14 | -199.316719 | -200.655931 | -199.272346 | -200.457617 |
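Because the diagonal of cov is all ones, it is also the target correlation matrix, so a quick check (an added sketch using pandas' DataFrame.corr) is to compare the empirical correlations of df_set_2 against it.

Code
# empirical correlation matrix of the generated sample; with 1000 draws it
# should be close to the target matrix `cov` defined above
print(df_set_2.corr().round(2))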
Some data plots for Table 2
Code
fig, ax = plt.subplots(nrows=1, ncols=2)
plt.tight_layout(pad=3)
df_set_2.plot(x = "feature E",y="feature G", kind="scatter", color = "r", alpha = .5, ax=ax[0], label = "feature G, $corr_{feature E}$ = .6")
df_set_2.plot(x = "feature E",y="feature F", kind="scatter", ax = ax[0], color = "b", alpha = .2, label = "feature F, $corr_{feature E}$ = .6")
ax[0].set_ylabel("feature G and feature F");
df_set_2.plot(x = "feature F",y="feature H", kind="scatter", color = "r", alpha = .5, ax=ax[1])
df_set_2.plot(x = "feature F",y="feature G", kind="scatter", ax = ax[1], color = "b", alpha = .2)
ax[1].set_ylabel("feature H and feature G");
Code
sns.pairplot(df_set_2);
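As with the first set, a correlation map can be drawn for df_set_2; this is a minimal sketch (an addition) using the same seaborn heatmap approach, where the block structure of the chosen covariance matrix should be visible.

Code
# correlation map for the second set; values should roughly match `cov`
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(df_set_2.corr(), annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
plt.tight_layout()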