Random Generation for tabular data
This is just a short note on generating random data to be used for pair plots and correlation maps. For the plot style we use the HEP style (via mplhep).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
import seaborn as sns
import random
from scipy.stats import multivariate_normal as mvn
import mplhep as hep
import warnings
# Global setup: fix the random seeds and set the default figure styling.
seed = 1234
random.seed(seed)
# All data below is drawn through np.random, which keeps its own generator
# state; seeding only the stdlib `random` module would leave those draws
# irreproducible.
np.random.seed(seed)
# NOTE(review): part of this cell was lost in extraction (the notes mention
# the HEP plot style, but no mplhep style call survives here) - confirm
# against the original notebook.
plt.rcParams['savefig.facecolor'] = "0.8"
plt.rcParams.update({'font.size': 13})
# create first set
# Six independent columns of 1000 samples each, drawn from different
# distributions so that the pair plot shows visibly different shapes.

# normally distributed features
feature_a = np.random.normal(500, 25, size=1000)
feature_b = np.random.normal(1023, 19, size=1000)
# integer features: uniform floats truncated to 0..59, and integers 0..39
feature_d = np.random.uniform(low=0, high=60, size=(1000,)).astype(int)
feature_e = np.random.choice(a=40, size=1000)

# skewed normal feature, rescaled to the range [0, maxValue]
numValues = 1000
maxValue = 100
skewness = -5  # Negative values are left skewed, positive values are right skewed.
# A dedicated name is used for the samples so the imported `random` module
# is not shadowed by an array.
skewed_values = skewnorm.rvs(a=skewness, loc=maxValue, size=numValues)  # Skewnorm function
skewed_values = skewed_values - min(skewed_values)  # Shift the set so the minimum value is equal to zero.
skewed_values = skewed_values / max(skewed_values)  # Standardize all the values between 0 and 1.
skewed_values = skewed_values * maxValue  # Multiply the standardized values by the maximum value.
feature_c = skewed_values

# power-law distributed feature (values in [0, 1])
feature_f = np.random.power(1.75, numValues)

# prepare to create dataframe
list_1 = feature_a.tolist()
list_2 = feature_b.tolist()
list_3 = feature_c.tolist()
list_4 = feature_d.tolist()
list_5 = feature_e.tolist()
list_6 = feature_f.tolist()

data_set_1 = {'feature A': list_1, 'feature B': list_2, 'feature C': list_3,
              'feature D': list_4, 'feature E': list_5, 'feature F': list_6}
df_set_1 = pd.DataFrame(data_set_1)
df_set_1.head(15)
| feature A | feature B | feature C | feature D | feature E | feature F |
0 | 511.785879 | 992.938048 | 76.072909 | 38 | 22 | 0.857588 |
1 | 470.225608 | 992.216386 | 65.754836 | 23 | 36 | 0.948207 |
2 | 535.817674 | 1023.884676 | 74.986424 | 58 | 20 | 0.903021 |
3 | 492.183703 | 991.083252 | 58.718138 | 26 | 18 | 0.837142 |
4 | 481.985282 | 1049.521954 | 84.559789 | 48 | 18 | 0.653904 |
5 | 522.179074 | 1006.945543 | 80.584231 | 58 | 18 | 0.713595 |
6 | 521.489710 | 1038.466132 | 79.887988 | 35 | 19 | 0.706957 |
7 | 484.086912 | 1022.054896 | 77.791707 | 38 | 33 | 0.533313 |
8 | 500.392409 | 1033.150688 | 66.112721 | 36 | 33 | 0.787118 |
9 | 443.932876 | 1007.666827 | 74.728363 | 38 | 22 | 0.320662 |
10 | 528.750893 | 1038.991174 | 80.805396 | 49 | 7 | 0.650921 |
11 | 524.798651 | 1012.394720 | 74.509873 | 3 | 6 | 0.695892 |
12 | 523.833103 | 1011.046008 | 63.021237 | 16 | 4 | 0.322784 |
13 | 449.468629 | 1045.375675 | 55.890691 | 15 | 14 | 0.748238 |
14 | 491.648066 | 1006.943466 | 27.852004 | 59 | 12 | 0.624733 |
Some data plots for Table 1
# 2x2 grid of scatter plots for the first data set. Each panel overlays two
# features (red and blue) against one shared x feature.
fig, ax = plt.subplots(nrows=2, ncols=2)
# NOTE(review): a layout call ending in "=.95)" followed here in the original
# notebook but was lost in extraction - restore it from the source notebook.

# (axes, x column, red y column, blue y column, y-axis label)
panels_1 = [
    (ax[0][0], "feature A", "feature F", "feature B", "feature F and feature B"),
    (ax[0][1], "feature B", "feature F", "feature E", "feature F and feature E"),
    (ax[1][0], "feature C", "feature A", "feature D", "feature A and feature D"),
    (ax[1][1], "feature F", "feature A", "feature C", "feature A and feature C"),
]
for axis, x_col, red_y, blue_y, y_label in panels_1:
    df_set_1.plot(x=x_col, y=red_y, kind="scatter", color="r", alpha=.5, ax=axis)
    df_set_1.plot(x=x_col, y=blue_y, kind="scatter", ax=axis, color="b", alpha=.2)
    axis.set_ylabel(y_label)

# Pairwise scatter/histogram matrix of all six features; the trailing
# semicolon suppresses the returned object's repr in a notebook.
sns.pairplot(df_set_1);
# create second set
# Four correlated features drawn from a multivariate normal distribution.
# The covariance matrix is written out by hand (the notebook also sketched
# estimating it from existing data with np.cov(..., bias=True), unused here).
# With a unit diagonal, each off-diagonal entry is directly the correlation
# between the two features.
cov = np.array([[1, 0.8, .7, .6],
                [.8, 1., .5, .5],
                [0.7, .5, 1., .5],
                [0.6, .5, .5, 1]])
scores_ = mvn.rvs(mean=[-200., -200., -200., -200.], cov=cov, size=1000)
# Column order follows the rows/columns of `cov`: E, F, H, G.
df_set_2 = pd.DataFrame(data=scores_, columns=["feature E", "feature F", "feature H", "feature G"])
df_set_2.head(15)
| feature E | feature F | feature H | feature G |
0 | -199.495066 | -199.908783 | -199.196850 | -199.866409 |
1 | -202.273674 | -202.157820 | -200.138768 | -201.427624 |
2 | -200.831163 | -199.377240 | -201.399875 | -201.394457 |
3 | -200.673144 | -200.383514 | -201.184697 | -199.566227 |
4 | -200.083799 | -200.251282 | -200.132608 | -199.823847 |
5 | -201.451594 | -200.635146 | -201.021892 | -200.708970 |
6 | -199.810856 | -200.173395 | -198.696014 | -199.614389 |
7 | -201.341426 | -201.206530 | -202.029928 | -200.957953 |
8 | -200.043966 | -199.170888 | -200.119988 | -200.141972 |
9 | -198.519039 | -199.157659 | -198.907071 | -200.315265 |
10 | -198.008390 | -199.410725 | -199.420994 | -199.457605 |
11 | -200.675343 | -200.700744 | -201.040379 | -200.743832 |
12 | -201.649402 | -200.300023 | -201.838069 | -200.687691 |
13 | -198.625462 | -198.742747 | -198.052051 | -198.507148 |
14 | -199.316719 | -200.655931 | -199.272346 | -200.457617 |
Some data plots for Table 2
# Two side-by-side scatter panels for the correlated second data set.
fig, ax = plt.subplots(nrows=1, ncols=2)
# NOTE(review): a call ending in "=3)" followed here in the original notebook
# but was lost in extraction - restore it from the source notebook.

# Left panel: features G and F against feature E. The legend quotes the
# correlations specified in the covariance matrix (E-G = .6, E-F = .8); the
# feature F label previously stated .6, which contradicted that matrix.
df_set_2.plot(x="feature E", y="feature G", kind="scatter", color="r", alpha=.5, ax=ax[0],
              label="feature G, $corr_{feature E}$ = .6")
df_set_2.plot(x="feature E", y="feature F", kind="scatter", ax=ax[0], color="b", alpha=.2,
              label="feature F, $corr_{feature E}$ = .8")
ax[0].set_ylabel("feature G and feature F")

# Right panel: features H and G against feature F.
df_set_2.plot(x="feature F", y="feature H", kind="scatter", color="r", alpha=.5, ax=ax[1])
df_set_2.plot(x="feature F", y="feature G", kind="scatter", ax=ax[1], color="b", alpha=.2)
ax[1].set_ylabel("feature G and feature H")

# Pairwise scatter/histogram matrix of the four correlated features; the
# trailing semicolon suppresses the returned object's repr in a notebook.
sns.pairplot(df_set_2);