Random Generation for tabular data

Published

August 3, 2023

This is just a short note on generating random data to be used for pair plots and correlation maps. For the plot style we use the HEP style (via `mplhep`).

Code

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
import seaborn as sns
import random
from scipy.stats import multivariate_normal as mvn
import mplhep as hep
# Apply the "CMS" style preset from mplhep to all subsequent matplotlib figures.
hep.style.use("CMS")

import warnings
# Suppress every warning for the rest of the session to keep the output clean.
warnings.filterwarnings('ignore')

# One fixed seed for both Python's and NumPy's random generators.
seed = 1234
random.seed(seed)
np.random.seed(seed)

# Figure defaults: light grey background for saved figures, font size 13.
plt.rcParams.update({'savefig.facecolor': "0.8", 'font.size': 13})


# First data set: six features, each drawn from a different distribution.
# The order of the draws is kept fixed so the sequence of random numbers
# taken from the generator does not change.
numValues = 1000   # number of samples generated for every feature
maxValue = 100     # upper bound of the rescaled skewed feature
skewness = -5      # negative values are left skewed, positive values are right skewed

# Normally distributed features, given as (mean, standard deviation).
feature_a = np.random.normal(500, 25, size=numValues)
feature_b = np.random.normal(1023, 19, size=numValues)

# Integer features: uniform floats in [0, 60) truncated to int, and
# integers sampled from range(40).
feature_d = np.random.uniform(low=0, high=60, size=numValues).astype(int)
feature_e = np.random.choice(a=40, size=numValues)

# Skewed normal feature. The sample lives in its own variable rather than in
# `random`, which would shadow the imported `random` module and break any
# later call such as `random.seed(seed)`.
skewed_sample = skewnorm.rvs(a=skewness, loc=maxValue, size=numValues)
skewed_sample = skewed_sample - skewed_sample.min()   # shift so the minimum is zero
skewed_sample = skewed_sample / skewed_sample.max()   # rescale all values into [0, 1]
feature_c = skewed_sample * maxValue                  # stretch to [0, maxValue]

# Feature drawn from NumPy's power distribution with a = 1.75.
feature_f = np.random.power(1.75, numValues)

Code

# Collect the six generated features into a single table.
# Every array is first converted to a plain Python list.
feature_arrays = (feature_a, feature_b, feature_c, feature_d, feature_e, feature_f)
list_1, list_2, list_3, list_4, list_5, list_6 = (
    values.tolist() for values in feature_arrays
)

# Pair each column label with its list, in order.
column_labels = ['feature A', 'feature B', 'feature C',
                 'feature D', 'feature E', 'feature F']
data_set_1 = dict(zip(column_labels,
                      (list_1, list_2, list_3, list_4, list_5, list_6)))

df_set_1 = pd.DataFrame(data_set_1)
df_set_1.head(15)   # preview the first 15 rows

Table 1: Random data drawn from different distributions
	feature A	feature B	feature C	feature D	feature E	feature F
0	511.785879	992.938048	76.072909	38	22	0.857588
1	470.225608	992.216386	65.754836	23	36	0.948207
2	535.817674	1023.884676	74.986424	58	20	0.903021
3	492.183703	991.083252	58.718138	26	18	0.837142
4	481.985282	1049.521954	84.559789	48	18	0.653904
5	522.179074	1006.945543	80.584231	58	18	0.713595
6	521.489710	1038.466132	79.887988	35	19	0.706957
7	484.086912	1022.054896	77.791707	38	33	0.533313
8	500.392409	1033.150688	66.112721	36	33	0.787118
9	443.932876	1007.666827	74.728363	38	22	0.320662
10	528.750893	1038.991174	80.805396	49	7	0.650921
11	524.798651	1012.394720	74.509873	3	6	0.695892
12	523.833103	1011.046008	63.021237	16	4	0.322784
13	449.468629	1045.375675	55.890691	15	14	0.748238
14	491.648066	1006.943466	27.852004	59	12	0.624733

Some plots of the data in Table 1

Code

# 2x2 grid of scatter plots. Every panel overlays two columns of df_set_1
# against a shared x column: the first series in red, the second in blue.
fig, ax = plt.subplots(nrows=2, ncols=2)
plt.tight_layout(pad=.95)

# (panel, x column, red y column, blue y column, y-axis label)
panel_specs = [
    (ax[0][0], "feature A", "feature F", "feature B", "feature F and feature B"),
    (ax[0][1], "feature B", "feature F", "feature E", "feature F and feature E"),
    (ax[1][0], "feature C", "feature A", "feature D", "feature A and feature D"),
    (ax[1][1], "feature F", "feature A", "feature C", "feature A and feature C"),
]

for panel, x_col, red_col, blue_col, y_label in panel_specs:
    df_set_1.plot(x=x_col, y=red_col, kind="scatter", color="r", alpha=.5, ax=panel)
    df_set_1.plot(x=x_col, y=blue_col, kind="scatter", ax=panel, color="b", alpha=.2)
    # Both series share the y axis, so the label names both columns.
    panel.set_ylabel(y_label)

Code

# Pair plot of the columns in df_set_1 (Table 1).
sns.pairplot(data=df_set_1);

Code

# Second data set: four correlated features sampled jointly from a
# multivariate normal distribution.
feature_names = ["feature E", "feature F", "feature H", "feature G"]

# Symmetric covariance matrix with unit variances; the sampled columns are
# labelled with `feature_names` in the same order as its rows/columns.
cov = np.array([
    [1, 0.8, .7, .6],
    [.8, 1., .5, .5],
    [0.7, .5, 1., .5],
    [0.6, .5, .5, 1],
])
means = [-200.] * 4   # every feature has mean -200
scores_ = mvn.rvs(mean=means, cov=cov, size=1000)

df_set_2 = pd.DataFrame(data=scores_, columns=feature_names)
df_set_2.head(15)   # preview the first 15 rows

Table 2: Random Data
	feature E	feature F	feature H	feature G
0	-199.495066	-199.908783	-199.196850	-199.866409
1	-202.273674	-202.157820	-200.138768	-201.427624
2	-200.831163	-199.377240	-201.399875	-201.394457
3	-200.673144	-200.383514	-201.184697	-199.566227
4	-200.083799	-200.251282	-200.132608	-199.823847
5	-201.451594	-200.635146	-201.021892	-200.708970
6	-199.810856	-200.173395	-198.696014	-199.614389
7	-201.341426	-201.206530	-202.029928	-200.957953
8	-200.043966	-199.170888	-200.119988	-200.141972
9	-198.519039	-199.157659	-198.907071	-200.315265
10	-198.008390	-199.410725	-199.420994	-199.457605
11	-200.675343	-200.700744	-201.040379	-200.743832
12	-201.649402	-200.300023	-201.838069	-200.687691
13	-198.625462	-198.742747	-198.052051	-198.507148
14	-199.316719	-200.655931	-199.272346	-200.457617

Some plots of the data in Table 2

Code

# Side-by-side scatter plots of the correlated features in df_set_2.
fig, ax = plt.subplots(nrows=1, ncols=2)
plt.tight_layout(pad=3)

# Left panel: feature G (red) and feature F (blue) against feature E.
# The legend values come from the feature E row of the covariance matrix used
# to generate the data (unit variances, so they equal the correlations):
# 0.6 for feature G and 0.8 for feature F.
df_set_2.plot(x = "feature E",y="feature G", kind="scatter", color = "r", alpha = .5, ax=ax[0], label = "feature G, $corr_{feature E}$ = .6")
df_set_2.plot(x = "feature E",y="feature F", kind="scatter", ax = ax[0], color = "b", alpha = .2, label = "feature F, $corr_{feature E}$ = .8")
ax[0].set_ylabel("feature G and feature F");

# Right panel: feature H (red) and feature G (blue) against feature F.
df_set_2.plot(x = "feature F",y="feature H", kind="scatter", color = "r", alpha = .5, ax=ax[1])
df_set_2.plot(x = "feature F",y="feature G", kind="scatter", ax = ax[1], color = "b", alpha = .2)
ax[1].set_ylabel("feature G and feature H");

Code

# Pair plot of the columns in df_set_2 (Table 2).
sns.pairplot(data=df_set_2);