Random Generation for tabular data

Random Generation for tabular data


August 3, 2023

This is just a short note on generating random data to be used for pair plots and correlation maps. For the plot style we use the HEP style.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
import seaborn as sns
import random
from scipy.stats import multivariate_normal as mvn
import mplhep as hep

import warnings

# Fixed seed so the random tables and figures below are reproducible.
# Seeding NumPy's global generator also covers the scipy.stats ``rvs`` calls
# in this note, because they draw from that same global state when no
# ``random_state`` argument is passed.
seed = 1234
np.random.seed(seed)

# Figure defaults: light-grey background for saved figures, slightly larger font.
plt.rcParams['savefig.facecolor'] = "0.8"
plt.rcParams.update({'font.size': 13})

# Build the first dataset: six independently generated features, each drawn
# from a different distribution, collected into the DataFrame ``df_set_1``.
numValues = 1000   # number of rows generated for every feature
maxValue = 100     # upper bound of the rescaled skewed feature (feature C)
skewness = -5      # negative values are left skewed, positive values are right skewed

# Features following a normal distribution.
feature_a = np.random.normal(500, 25, size=numValues)
feature_b = np.random.normal(1023, 19, size=numValues)
# Uniform draws on [0, 60) truncated to integers.
feature_d = np.random.uniform(low=0, high=60, size=(numValues,)).astype(int)
# Integers drawn from range(40).
feature_e = np.random.choice(a=40, size=numValues)

# Feature following a skewed normal distribution, rescaled to [0, maxValue].
# The sample is kept in ``skewed_sample`` (not ``random``) so that the
# imported ``random`` module is not overwritten.
skewed_sample = skewnorm.rvs(a=skewness, loc=maxValue, size=numValues)
skewed_sample = skewed_sample - skewed_sample.min()   # shift so the minimum is zero
skewed_sample = skewed_sample / skewed_sample.max()   # standardize all values to [0, 1]
feature_c = skewed_sample * maxValue                  # stretch to [0, maxValue]

# Feature following a power-law distribution on [0, 1].
feature_f = np.random.power(1.75, numValues)

# Convert to plain Python lists before building the DataFrame.
list_1 = feature_a.tolist()
list_2 = feature_b.tolist()
list_3 = feature_c.tolist()
list_4 = feature_d.tolist()
list_5 = feature_e.tolist()
list_6 = feature_f.tolist()

data_set_1 = {
    'feature A': list_1,
    'feature B': list_2,
    'feature C': list_3,
    'feature D': list_4,
    'feature E': list_5,
    'feature F': list_6,
}

df_set_1 = pd.DataFrame(data_set_1)
Table 1: Random Data based on distribution
feature A feature B feature C feature D feature E feature F
0 511.785879 992.938048 76.072909 38 22 0.857588
1 470.225608 992.216386 65.754836 23 36 0.948207
2 535.817674 1023.884676 74.986424 58 20 0.903021
3 492.183703 991.083252 58.718138 26 18 0.837142
4 481.985282 1049.521954 84.559789 48 18 0.653904
5 522.179074 1006.945543 80.584231 58 18 0.713595
6 521.489710 1038.466132 79.887988 35 19 0.706957
7 484.086912 1022.054896 77.791707 38 33 0.533313
8 500.392409 1033.150688 66.112721 36 33 0.787118
9 443.932876 1007.666827 74.728363 38 22 0.320662
10 528.750893 1038.991174 80.805396 49 7 0.650921
11 524.798651 1012.394720 74.509873 3 6 0.695892
12 523.833103 1011.046008 63.021237 16 4 0.322784
13 449.468629 1045.375675 55.890691 15 14 0.748238
14 491.648066 1006.943466 27.852004 59 12 0.624733

Some data plots for Table 1

# 2x2 grid of scatter plots for dataset 1. Every panel overlays two features
# against a shared x column: the first in red, the second in fainter blue.
fig, ax = plt.subplots(nrows=2, ncols=2)

# (axes, x column, column drawn in red, column drawn in blue)
panels = [
    (ax[0][0], "feature A", "feature F", "feature B"),
    (ax[0][1], "feature B", "feature F", "feature E"),
    (ax[1][0], "feature C", "feature A", "feature D"),
    (ax[1][1], "feature F", "feature A", "feature C"),
]

for panel, x_col, red_col, blue_col in panels:
    df_set_1.plot(x=x_col, y=red_col, kind="scatter", color="r", alpha=.5, ax=panel)
    df_set_1.plot(x=x_col, y=blue_col, kind="scatter", ax=panel, color="b", alpha=.2)
    # Both series share the y axis, so the label names both of them.
    panel.set_ylabel(f"{red_col} and {blue_col}")

Figure 1: Scatter Plot for Dataset 1

Figure 2: Pair Plot for Dataset 1
# Build the second dataset: four correlated features drawn jointly from a
# multivariate normal distribution.
# The covariance matrix is written out by hand here rather than estimated
# from existing data with np.cov. Its diagonal is 1, so the off-diagonal
# entries are also the pairwise correlations.
feature_names = ["feature E", "feature F", "feature H", "feature G"]

cov = np.array([
    [1.0, 0.8, 0.7, 0.6],
    [0.8, 1.0, 0.5, 0.5],
    [0.7, 0.5, 1.0, 0.5],
    [0.6, 0.5, 0.5, 1.0],
])

# Every feature is centred on -200.
mean_vector = [-200.0] * len(feature_names)

scores_ = mvn.rvs(mean=mean_vector, cov=cov, size=1000)
df_set_2 = pd.DataFrame(scores_, columns=feature_names)
Table 2: Random Data
feature E feature F feature H feature G
0 -199.495066 -199.908783 -199.196850 -199.866409
1 -202.273674 -202.157820 -200.138768 -201.427624
2 -200.831163 -199.377240 -201.399875 -201.394457
3 -200.673144 -200.383514 -201.184697 -199.566227
4 -200.083799 -200.251282 -200.132608 -199.823847
5 -201.451594 -200.635146 -201.021892 -200.708970
6 -199.810856 -200.173395 -198.696014 -199.614389
7 -201.341426 -201.206530 -202.029928 -200.957953
8 -200.043966 -199.170888 -200.119988 -200.141972
9 -198.519039 -199.157659 -198.907071 -200.315265
10 -198.008390 -199.410725 -199.420994 -199.457605
11 -200.675343 -200.700744 -201.040379 -200.743832
12 -201.649402 -200.300023 -201.838069 -200.687691
13 -198.625462 -198.742747 -198.052051 -198.507148
14 -199.316719 -200.655931 -199.272346 -200.457617

Some data plots for Table 2

# Two side-by-side scatter panels for the correlated features of dataset 2.
fig, ax = plt.subplots(nrows=1, ncols=2)

# Left panel: feature G (red) and feature F (blue) against feature E.
# The legend quotes each feature's correlation with feature E as set in the
# covariance matrix used to generate the data: 0.6 for feature G and 0.8 for
# feature F (the variances are 1, so covariance and correlation coincide).
df_set_2.plot(x="feature E", y="feature G", kind="scatter", color="r", alpha=.5, ax=ax[0],
              label="feature G, $corr_{feature E}$ = .6")
df_set_2.plot(x="feature E", y="feature F", kind="scatter", ax=ax[0], color="b", alpha=.2,
              label="feature F, $corr_{feature E}$ = .8")
ax[0].set_ylabel("feature G and feature F");

# Right panel: feature H (red) and feature G (blue) against feature F.
df_set_2.plot(x="feature F", y="feature H", kind="scatter", color="r", alpha=.5, ax=ax[1])
df_set_2.plot(x="feature F", y="feature G", kind="scatter", ax=ax[1], color="b", alpha=.2)
ax[1].set_ylabel("feature G and feature H");

Figure 3: Scatter Plot for Dataset 2

Figure 4: Scatter Plot for Dataset 2