Skip to main content

Data sampling in Python

# __author__ = 'Bayes Server'
# __version__= '0.1'

import pandas as pd
import jpype # pip install jpype1 (version 1.2.1 or later)
import jpype.imports
from jpype.types import *

# Location of the Bayes Server Java API jar; it is handed to the JVM as its classpath below.
# TODO download the Bayes Server Java API, and adjust the path
classpath = "lib/bayesserver-10.8.jar"

# Launch the JVM with the jar above on its classpath.
# NOTE(review): this must stay ahead of the `com.bayesserver` imports further
# down - per the JPype docs, Java packages only become importable once the JVM
# is running.
jpype.startJVM(classpath=[classpath])

import data_frame_utils as dfu

# import the Java modules
from com.bayesserver import *
from com.bayesserver.inference import *
from com.bayesserver.learning.parameters import *
from com.bayesserver.data import *
from com.bayesserver.data.sampling import *
from jpype import java, JImplements, JOverride

# Uncomment the following line and change the license key, if you are using a licensed version
# License.validate("xxx")

def value_as_text(variable, evidence):
    """Render the value that `evidence` holds for `variable` as text.

    Args:
        variable: The Bayes Server variable to render.
        evidence: The evidence object queried for that variable.

    Returns:
        ``"(null)"`` when the evidence type recorded for the variable is
        ``EvidenceType.NONE``; the evidence value formatted as a string when
        the variable's value type is ``VariableValueType.CONTINUOUS``;
        otherwise the name of the state the evidence selects.
    """
    evidence_type = evidence.getEvidenceType(variable)
    if evidence_type == EvidenceType.NONE:
        # Nothing is set for this variable in the evidence.
        return "(null)"

    is_continuous = variable.getValueType() == VariableValueType.CONTINUOUS
    if is_continuous:
        return f"{evidence.get(variable)}"

    # Non-continuous variable: look up the state picked by the evidence and
    # report its name.
    states = variable.getStates()
    selected_state = evidence.getState(variable)
    return states.get(selected_state).getName()


def create_network():
    """Build, parameterise and validate the example Bayesian network.

    The network has three nodes: a discrete ``Gender`` node
    (``Female``/``Male``) which is the parent of both a continuous ``Height``
    node and a discrete ``Hair Length`` node (``Short``/``Medium``/``Long``).

    Returns:
        The ``Network`` instance, after ``validate`` has been called on it.
    """
    net = Network()
    nodes = net.getNodes()
    links = net.getLinks()

    # --- structure: nodes ---
    gender_node = Node("Gender", ["Female", "Male"])
    nodes.add(gender_node)

    height_node = Node("Height", VariableValueType.CONTINUOUS)
    nodes.add(height_node)

    hair_node = Node("Hair Length", ["Short", "Medium", "Long"])
    nodes.add(hair_node)

    # --- structure: links (Gender is the parent of the other two nodes) ---
    for child_node in (height_node, hair_node):
        links.add(Link(gender_node, child_node))

    # The structure of the Bayesian network is now fully specified;
    # what follows sets the parameters.

    # Gender distribution.
    gender_table = gender_node.newDistribution().getTable()
    gender_table.copyFrom([0.51, 0.49])
    gender_node.setDistribution(gender_table)

    # Hair Length given Gender; the iterator fixes the variable order used by
    # the flat list of values.
    hair_table = hair_node.newDistribution().getTable()
    hair_iterator = TableIterator(hair_table, [gender_node, hair_node])
    hair_iterator.copyFrom([0.1, 0.4, 0.5, 0.8, 0.15, 0.05])
    hair_node.setDistribution(hair_table)

    # Height given Gender: index 0 is the female case, index 1 the male case;
    # both share the same variance.
    height_distribution = height_node.newDistribution()
    height_variance = 50.58
    for gender_index, mean_height in ((0, 162.56), (1, 176.022)):
        height_distribution.setMean(gender_index, 0, mean_height)
        height_distribution.setVariance(gender_index, 0, height_variance)
    height_node.setDistribution(height_distribution)

    # Check that the Bayesian network is specified correctly.
    net.validate(ValidationOptions())

    return net

# The network is built by hand here; it could equally be loaded from a file.
network = create_network()

# Fetch the three variables by name, in the order they are printed below.
gender, height, hairLength = (
    network.getVariables().get(variable_name)
    for variable_name in ("Gender", "Height", "Hair Length")
)

# 'fixedEvidence' holds any variables that should be fixed while sampling.
# Here Gender is fixed to "Female".
fixedEvidence = DefaultEvidence(network)
female_state = gender.getStates().get("Female", True)
fixedEvidence.setState(female_state)

# Set up the sampler that draws data from the Bayesian network.
sampler = DataSampler(network, fixedEvidence)
options = DataSamplingOptions()

# To simulate missing data, enable the following line...
# options.setMissingDataProbability(0.05) # set 5% of the data to missing

random = RandomDefault(0)
# 'sample' acts as a buffer that receives each sample in turn.
sample = DefaultEvidence(network)

# Print a header followed by 100 sampled rows.
print(
    "Gender\tHeight\tHair Length",
    "------------------------------",
    sep="\n",
)

for _ in range(100):
    try:
        sampler.takeSample(sample, random, options)
        row = f"{value_as_text(gender, sample)}\t{value_as_text(height, sample)}\t{value_as_text(hairLength, sample)}"
        print(row)
    except InconsistentEvidenceException:
        print("Inconsistent evidence exception was raised.")