Random Generation for tabular data
This is a short note on generating random data to be used for pair plots and correlation maps. For the plot style we use the HEP style (via `mplhep`).
Code
# Standard library
import random
import warnings

# Third-party
import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import multivariate_normal as mvn
from scipy.stats import skewnorm

# Apply the CMS (HEP) plotting style to every figure created below.
hep.style.use("CMS")

# Suppress all warnings so the rendered output stays readable.
warnings.filterwarnings('ignore')

# Seed both NumPy's global generator and Python's `random` module so the
# generated data can be reproduced.
seed = 1234
np.random.seed(seed)
random.seed(seed)

# Figure defaults: grey level "0.8" as the saved-figure background and a
# base font size of 13.
plt.rcParams['savefig.facecolor'] = "0.8"
plt.rcParams.update({'font.size': 13})
# --- First data set: six independently generated features, 1000 samples each ---

# Normally distributed features, given as (mean, standard deviation).
feature_a = np.random.normal(500, 25, size=1000)
feature_b = np.random.normal(1023, 19, size=1000)

# Integer-valued features: uniform draws on [0, 60) truncated to int, and
# integers picked uniformly from range(40).
feature_d = np.random.uniform(low=0, high=60, size=(1000,)).astype(int)
feature_e = np.random.choice(a=40, size=1000)

# Skewed normal feature, rescaled to span exactly [0, maxValue].
numValues = 1000
maxValue = 100
skewness = -5  # Negative values are left skewed, positive values are right skewed.

# NOTE: the working array is deliberately not called `random`; that name
# would rebind the stdlib `random` module imported at the top of the file.
skewed = skewnorm.rvs(a=skewness, loc=maxValue, size=numValues)  # Skewnorm function
skewed = skewed - skewed.min()  # Shift the set so the minimum value is equal to zero.
skewed = skewed / skewed.max()  # Standardize all the values between 0 and 1.
feature_c = skewed * maxValue  # Multiply the standardized values by the maximum value.

# Feature following a power-law distribution (exponent parameter a = 1.75);
# numpy.random.power returns samples in [0, 1].
feature_f = np.random.power(1.75, numValues)
Code
# Assemble the six features of the first set into a single DataFrame.
# Each NumPy array is converted to a plain Python list first.
list_1 = feature_a.tolist()
list_2 = feature_b.tolist()
list_3 = feature_c.tolist()
list_4 = feature_d.tolist()
list_5 = feature_e.tolist()
list_6 = feature_f.tolist()

# Column label -> values; the labels are reused by the plotting code.
data_set_1 = {'feature A': list_1, 'feature B': list_2, 'feature C': list_3,
              'feature D': list_4, 'feature E': list_5, 'feature F': list_6}
df_set_1 = pd.DataFrame(data_set_1)

# Preview the first 15 rows.
df_set_1.head(15)
 | feature A | feature B | feature C | feature D | feature E | feature F |
---|---|---|---|---|---|---|
0 | 511.785879 | 992.938048 | 76.072909 | 38 | 22 | 0.857588 |
1 | 470.225608 | 992.216386 | 65.754836 | 23 | 36 | 0.948207 |
2 | 535.817674 | 1023.884676 | 74.986424 | 58 | 20 | 0.903021 |
3 | 492.183703 | 991.083252 | 58.718138 | 26 | 18 | 0.837142 |
4 | 481.985282 | 1049.521954 | 84.559789 | 48 | 18 | 0.653904 |
5 | 522.179074 | 1006.945543 | 80.584231 | 58 | 18 | 0.713595 |
6 | 521.489710 | 1038.466132 | 79.887988 | 35 | 19 | 0.706957 |
7 | 484.086912 | 1022.054896 | 77.791707 | 38 | 33 | 0.533313 |
8 | 500.392409 | 1033.150688 | 66.112721 | 36 | 33 | 0.787118 |
9 | 443.932876 | 1007.666827 | 74.728363 | 38 | 22 | 0.320662 |
10 | 528.750893 | 1038.991174 | 80.805396 | 49 | 7 | 0.650921 |
11 | 524.798651 | 1012.394720 | 74.509873 | 3 | 6 | 0.695892 |
12 | 523.833103 | 1011.046008 | 63.021237 | 16 | 4 | 0.322784 |
13 | 449.468629 | 1045.375675 | 55.890691 | 15 | 14 | 0.748238 |
14 | 491.648066 | 1006.943466 | 27.852004 | 59 | 12 | 0.624733 |
Some data plot for Table 1
Code
# 2x2 grid of scatter plots for the first data set. Each panel overlays two
# y-features (red, then blue) against one shared x-feature.
fig, ax = plt.subplots(nrows=2, ncols=2)
plt.tight_layout(pad=.95)

# Top-left: feature F (red) and feature B (blue) against feature A.
df_set_1.plot(x="feature A", y="feature F", kind="scatter", color="r", alpha=.5, ax=ax[0][0])
df_set_1.plot(x="feature A", y="feature B", kind="scatter", ax=ax[0][0], color="b", alpha=.2)
ax[0][0].set_ylabel("feature F and feature B");

# Top-right: feature F (red) and feature E (blue) against feature B.
df_set_1.plot(x="feature B", y="feature F", kind="scatter", color="r", alpha=.5, ax=ax[0][1])
df_set_1.plot(x="feature B", y="feature E", kind="scatter", ax=ax[0][1], color="b", alpha=.2)
ax[0][1].set_ylabel("feature F and feature E");

# Bottom-left: feature A (red) and feature D (blue) against feature C.
df_set_1.plot(x="feature C", y="feature A", kind="scatter", color="r", alpha=.5, ax=ax[1][0])
df_set_1.plot(x="feature C", y="feature D", kind="scatter", ax=ax[1][0], color="b", alpha=.2)
ax[1][0].set_ylabel("feature A and feature D");

# Bottom-right: feature A (red) and feature C (blue) against feature F.
df_set_1.plot(x="feature F", y="feature A", kind="scatter", color="r", alpha=.5, ax=ax[1][1])
df_set_1.plot(x="feature F", y="feature C", kind="scatter", ax=ax[1][1], color="b", alpha=.2)
ax[1][1].set_ylabel("feature A and feature C");
Code
# Pair plot of every feature of the first set against every other feature.
# The trailing semicolon keeps the notebook from echoing the returned object.
sns.pairplot(df_set_1);
Code
# --- Second data set: four correlated features ---

# Covariance matrix of the four features, in the column order used below
# (E, F, H, G). The diagonal is 1, so each off-diagonal entry is also the
# correlation between that pair of features.
cov = np.array([[1, 0.8, .7, .6],
                [.8, 1., .5, .5],
                [0.7, .5, 1., .5],
                [0.6, .5, .5, 1]])

# Draw 1000 samples from a multivariate normal centred on -200 in every
# dimension; the result has one row per sample and one column per feature.
scores_ = mvn.rvs(mean=[-200., -200., -200., -200.], cov=cov, size=1000)
df_set_2 = pd.DataFrame(data=scores_, columns=["feature E", "feature F", "feature H", "feature G"])

# Preview the first 15 rows.
df_set_2.head(15)
 | feature E | feature F | feature H | feature G |
---|---|---|---|---|
0 | -199.495066 | -199.908783 | -199.196850 | -199.866409 |
1 | -202.273674 | -202.157820 | -200.138768 | -201.427624 |
2 | -200.831163 | -199.377240 | -201.399875 | -201.394457 |
3 | -200.673144 | -200.383514 | -201.184697 | -199.566227 |
4 | -200.083799 | -200.251282 | -200.132608 | -199.823847 |
5 | -201.451594 | -200.635146 | -201.021892 | -200.708970 |
6 | -199.810856 | -200.173395 | -198.696014 | -199.614389 |
7 | -201.341426 | -201.206530 | -202.029928 | -200.957953 |
8 | -200.043966 | -199.170888 | -200.119988 | -200.141972 |
9 | -198.519039 | -199.157659 | -198.907071 | -200.315265 |
10 | -198.008390 | -199.410725 | -199.420994 | -199.457605 |
11 | -200.675343 | -200.700744 | -201.040379 | -200.743832 |
12 | -201.649402 | -200.300023 | -201.838069 | -200.687691 |
13 | -198.625462 | -198.742747 | -198.052051 | -198.507148 |
14 | -199.316719 | -200.655931 | -199.272346 | -200.457617 |
Some data plot for Table 2
Code
# 1x2 grid of scatter plots for the second (correlated) data set. Each panel
# overlays two y-features (red, then blue) against one shared x-feature.
fig, ax = plt.subplots(nrows=1, ncols=2)
plt.tight_layout(pad=3)

# Left: feature G (red) and feature F (blue) against feature E. The legend
# labels quote the matching entries of the covariance matrix used to
# generate the data: E-G is .6 and E-F is .8.
df_set_2.plot(x="feature E", y="feature G", kind="scatter", color="r", alpha=.5, ax=ax[0], label="feature G, $corr_{feature E}$ = .6")
df_set_2.plot(x="feature E", y="feature F", kind="scatter", ax=ax[0], color="b", alpha=.2, label="feature F, $corr_{feature E}$ = .8")
ax[0].set_ylabel("feature G and feature F");

# Right: feature H (red) and feature G (blue) against feature F.
df_set_2.plot(x="feature F", y="feature H", kind="scatter", color="r", alpha=.5, ax=ax[1])
df_set_2.plot(x="feature F", y="feature G", kind="scatter", ax=ax[1], color="b", alpha=.2)
ax[1].set_ylabel("feature G and feature H");
Code
# Pair plot of every feature of the second set against every other feature.
# The trailing semicolon keeps the notebook from echoing the returned object.
sns.pairplot(df_set_2);