Answer To: SIT743 Multivariate and Categorical Data Analysis Assignment-2 Total Marks = 100, Weighting - 40%...
Abr Writing answered on Sep 25 2021
assignment.R
library(bnlearn)
# source("https://bioconductor.org/biocLite.R")
# biocLite("RBGL")
library(RBGL)
library(gRbase)
library(gRain)
# biocLite("Rgraphviz")
library(Rgraphviz)
library(visNetwork)## Problem 1.5bn <- model2network("[J][W|J][P|W][S|J][A][V|A:S][D|V][U|P:D][M|U]")
plot(bn)### a)dsep(bn, "W", "V")### b)dsep(bn, "A", "M", c("D", "W"))### c)dsep(bn, "A", "D", "V")dsep(bn, "W", "D", "V")
### Problem 2.1.alh <- c("low", "high")
lh4 <- c("low", "LM", "UM", "high")
e <- cptable(~eh, values=c(30, 70),levels=lh)
o.e <- cptable(~oil|eh, values=c(30, 70, 60, 40), levels=lh)
b.e <- cptable(~bp|eh, values=c(20, 45, 30, 05, 20, 50, 20, 10), levels=lh4)
i.oe <- cptable(~inf|oil:eh, values=c(10, 90, 60, 40, 50, 50, 40, 60), levels=lh)
r.ie <- cptable(~rt|inf:eh, values=c(20, 80, 60, 40, 70, 30, 20, 80), levels=lh)
plist <- compileCPT(list(e, o.e, b.e, i.oe, r.ie))
plistnet <- grain(plist)
net### Problem 2.1.bplist$eh
plist$oilplist$bpplist$infplist$rt
## Problem 2.2### Problem 2.2.a
querygrain(setEvidence(net,
evidence = list(
oil="low",
rt="low"
)),
nodes = c("bp"),
type = "marginal")
### Problem 2.2.b
querygrain(setEvidence(net,
evidence = list(
inf="low"
)),
nodes = c("rt"),
type = "marginal")
### Problem 2.2.c
querygrain(net, nodes = c("inf", "bp"), type="marginal")
### Problem 2.2.d
querygrain(net, nodes = c("inf", "bp"), type="joint")
# Problem 3## Problem 3.1
bn <- model2network("[A][B][C|A:B]")
plot(bn)dsep(bn, "A", "B")
## Problem 3.2## Problem 3.3
df <- data.frame(
a = c(1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1),
b = c(1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1),
c = c(1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1),
d = c(0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1),
e = c(0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1)
)
dfcat("Alpha = ",
sum(df$a == 0)/nrow(df),
"\n",
sep = "")
cat("Beta = ",
sum(df$c == 0 & df$e == 0)/sum(df$c == 0),
"\n",
sep = "")
cat("Gamma = ",
sum(df$c == 0 & df$d == 0)/sum(df$c == 0),
"\n",
sep = "")
cat("Theta = ",
sum(df$b == 0)/nrow(df),
sep = "")
## Problem 3.4
cat("P(E=1|A=1, B=0) = ",
0.2*sum(df$c == 0 & df$e == 0)/sum(df$c == 0)+0.56,
sep="")
# Problem 4
ChildData <- read.csv(file="CHILD10k.csv", header=TRUE, sep=",")#create and plot the network structure.
modelstring = paste0(
"[BirthAsphyxia][Disease|BirthAsphyxia][LVH|Disease][DuctFlow|Disease]",
"[CardiacMixing|Disease][LungParench|Disease][LungFlow|Disease][Sick|Disease]",
"[HypDistrib|DuctFlow:CardiacMixing][HypoxiaInO2|CardiacMixing:LungParench]",
"[CO2|LungParench][ChestXray|LungParench:LungFlow][Grunting|LungParench:Sick]",
"[LVHreport|LVH][Age|Disease:Sick][LowerBodyO2|HypDistrib:HypoxiaInO2]",
"[RUQO2|HypoxiaInO2][CO2Report|CO2][XrayReport|ChestXray][GruntingReport|Grunting]")
dag = model2network(modelstring)
par(mfrow = c(1,1))
graphviz.plot(dag)
## Problem 4.1
# plot network func
# using the visNetwork package to plot the network because it looks very nice.
plot.network <- function(structure, ht = "400px"){
nodes.uniq <- unique(c(structure$arcs[,1], structure$arcs[,2]))
nodes <- data.frame(id = nodes.uniq,
label = nodes.uniq,
color = "darkturquoise",
shadow = TRUE)
edges <- data.frame(from = structure$arcs[,1],
to = structure$arcs[,2],
arrows = "to",
smooth = TRUE,
shadow = TRUE,
color = "black")
return(visNetwork(nodes, edges, height = ht, width = "100%"))
}
### Problem 4.1.a
structure.a.1 <- hc(ChildData[1:100,], score = "bic")
plot.network(structure.a.1)structure.a.2 <- hc(ChildData[1:100,], score = "bde")
plot.network(structure.a.2)
### Problem 4.1.b
structure.b.1 <- hc(ChildData[1:500,], score = "bic")
plot.network(structure.b.1)structure.b.2 <- hc(ChildData[1:500,], score = "bde")
plot.network(structure.b.2)
### Problem 4.1.c
structure.c.1 <- hc(ChildData[1:1000,], score = "bic")
plot.network(structure.c.1)structure.c.2 <- hc(ChildData[1:1000,], score = "bde")
plot.network(structure.c.2)
### Problem 4.1.d
structure.d.1 <- hc(ChildData[1:5000,], score = "bic")
plot.network(structure.d.1)structure.d.2 <- hc(ChildData[1:5000,], score = "bde")
plot.network(structure.d.2)
## Problem 4.2
cat("BIC Score for first 100 data = ",
score(structure.a.1, ChildData, type = "bic"),
"\n",
sep="")
cat("BDE Score for first 100 data = ",
score(structure.a.2, ChildData, type = "bde"),
"\n",
sep="")
cat("BIC Score for first 500 data = ",
score(structure.b.1, ChildData, type = "bic"),
"\n",
sep="")
cat("BDE Score for first 500 data = ",
score(structure.b.2, ChildData, type = "bde"),
"\n",
sep="")
cat("BIC Score for first 1000 data = ",
score(structure.c.1, ChildData, type = "bic"),
"\n",
sep="")
cat("BDE Score for first 1000 data = ",
score(structure.c.2, ChildData, type = "bde"),
"\n",
sep="")
cat("BIC Score for first 5000 data = ",
score(structure.d.1, ChildData, type = "bic"),
"\n",
sep="")
cat("BDE Score for first 5000 data = ",
score(structure.d.2, ChildData, type = "bde"),
"\n",
sep="")
## Problem 4.3### Problem 4.3.a
structure.1 <- hc(ChildData, score = "bic")
plot.network(structure.1)
cat("BIC Score = ",
score(structure.1, ChildData, type = "bic"),
"\n",
sep="")structure.2 <- hc(ChildData, score = "bde")
plot.network(structure.2)
cat("BDE Score = ",
score(structure.2, ChildData, type = "bde"),
"\n",
sep="")
### Problem 4.3.b
graphviz.compare(structure.1, dag)graphviz.compare(structure.2, dag)
### Problem 4.3.c
bn.mod <- bn.fit(structure.1, data=ChildData)
bn.mod
### Problem 4.3.d
cpquery(bn.mod,
(Disease == "Lung"),
(CO2 == "High" & LungParench == "Abnormal"))
assignment.pdf
SIT743 Multivariate and Categorical Data Analysis
Assignment 2
25/09/2019
Problem 1
Problem 1.1
Using the Chain Rule of probability, we can write:
P (J,W, S,A, P, V,D,U,M) = P (M |U)P (U |P,D)P (U |P )P (P |W )P (W |J)
P (D|V )P (V |S,A)P (S|J)P (J)P (A)
Problem 1.2
The minimum number of parameters required to fully the distribution according to the above network is
evaluated below:
M = 3× (4− 1) = 9
U = 2× 4× (3− 1) = 16
P = 2× (2− 1) = 2
W = 4× (2− 1) = 4
J = (4− 1) = 3
D = 2× (4− 1) = 6
V = 4× 4× (2− 1) = 16
A = 4− 1 = 3
S = 4× (4− 1) = 12
Therefore, the minimum number of parameters required are:
Total parameters = 9 + 16 + 2 + 4 + 3 + 6 + 16 + 3 + 12 = 71
Problem 1.3
The total number of paramters required if there no independencies among the variables is assumed:
Total parameters = 4× 4× 4× 2× 2× 4× 2× 3× 4− 1 = 24575
We can see that the number of parameters required for defining the joint probability distribution is very low
in the directed bayesian network in comparison to case were we ignore all the independencies among the
variables.
1
Problem 1.4
a) W is marginally not independent of V as can be seen from the following path:
W ← J → S → V
Any change in V affects change in S and therefore J which will finally affects W. Therefore, W in
marginally not independent of V.
b) A is conditionally independent of M given {D, W}).Since, there is two paths between these two as
shown below:
M ← U ← D ← V ← A
And,
M ← U ← P ←W ← J → S → V ← A
c) A and W are conditionally independent of D given V. The path between W and D is shown below.
W ← J → S → V → D
And the path between A and D is shown below:
D ← V ← A
Therefore, in both the cases, V Is blocking the dependence between the two.
Problem 1.5
bn <- model2network("[J][W|J][P|W][S|J][A][V|A:S][D|V][U|P:D][M|U]")
plot(bn)
2
A
D
J
M P
S
U
V
W
a)
dsep(bn, "W", "V")
[1] FALSE
b)
dsep(bn, "A", "M", c("D", "W"))
[1] TRUE
c)
dsep(bn, "A", "D", "V")
[1] TRUE
dsep(bn, "W", "D", "V")
[1] TRUE
Problem 1.6
P (M |S = 3000− 6000, V = Y es, P = Peak hour,D <= 1) = P (U |P = Peak hour,D <= 1)P (M |U)
3
Problem 2
Problem 2.1
Problem 2.1.a
lh <- c("low", "high")
lh4 <- c("low", "LM", "UM", "high")
e <- cptable(~eh, values=c(30, 70),levels=lh)
o.e <- cptable(~oil|eh, values=c(30, 70, 60, 40), levels=lh)
b.e <- cptable(~bp|eh, values=c(20, 45, 30, 05, 20, 50, 20, 10), levels=lh4)
i.oe <- cptable(~inf|oil:eh, values=c(10, 90, 60, 40, 50, 50, 40, 60), levels=lh)
r.ie <- cptable(~rt|inf:eh, values=c(20, 80, 60, 40, 70, 30, 20, 80),...