private double iterate(Instances inst, boolean report) throws Exception { int i; double llkold = 0.0; double llk = 0.0;
boolean ok = false; int seed = m_rseed; int restartCount = 0; while (!ok) { try { for (i = 0; i < m_max_iterations; i++) { llkold = llk; llk = E(inst, true);
if (i > 0) { if ((llk - llkold) < 1e-6) { break; } } M(inst); } ok = true; } catch (Exception ex) { } }
return llk; } 可以看到有两种迭代中止的方法,第一种是达到了m_max_iterations,第二种是llk-llkold小于阈值,llk是log likelihood的缩写,EM的目标就是最大化它,增量小于阈值说明已经接近最优值,所以就停止了。下面就是E: private double E(Instances inst, boolean change_weights) throws Exception {
double loglk = 0.0, sOW = 0.0;
for (int l = 0; l < inst.numInstances(); l++) {
Instance in = inst.instance(l);
loglk += in.weight() * logDensityForInstance(in); sOW += in.weight();
if (change_weights) { m_weights[l] = distributionForInstance(in); } }
// reestimate priors if (change_weights) { estimate_priors(inst); } return loglk / sOW; } 这里logDensityForInstance的代码: public double logDensityForInstance(Instance instance) throws Exception { double[] a = logJointDensitiesForInstance(instance); double max = a[Utils.maxIndex(a)]; double sum = 0.0;
for (int i = 0; i < a.length; i++) { sum += Math.exp(a[i] - max); }
return max + Math.log(sum); } logJointDensitiesForInstance: public double[] logJointDensitiesForInstance(Instance inst) throws Exception {
double[] weights = logDensityPerClusterForInstance(inst); double[] priors = clusterPriors();
for (int i = 0; i < weights.length; i++) { if (priors[i] > 0) { weights[i] += Math.log(priors[i]); } else { throw new IllegalArgumentException("Cluster empty!"); } } return weights; } logDensityPerClusterForInstance: public double[] logDensityPerClusterForInstance(Instance inst) throws Exception {
int i, j; double logprob; double[] wghts = new double[m_num_clusters];
m_replaceMissing.input(inst); inst = m_replaceMissing.output();
for (i = 0; i < m_num_clusters; i++) { logprob = 0.0;
for (j = 0; j < m_num_attribs; j++) { if (!inst.isMissing(j)) { if (inst.attribute(j).isNominal()) { logprob += Math.log(m_model[i][j]. getProbability(inst.value(j))); } else { // numeric attribute logprob += logNormalDens(inst.value(j), m_modelNormal[i][j][0], m_modelNormal[i][j][1]); } } }
wghts[i] = logprob; } return wghts; } 对于离散型属性,这里计算它的概率的对数,它的概率计算很简单: public double getProbability(double data) {
if (m_SumOfCounts == 0) { return 0; } return (double) m_Counts[(int) data] / m_SumOfCounts; } 就是Laplace平滑后的概率,而对于连续属性: private static double m_normConst = Math.log(Math.sqrt(2 * Math.PI));
private double logNormalDens(double x, double mean, double stdDev) {
double diff = x - mean;
return -(diff * diff / (2 * stdDev * stdDev)) - m_normConst - Math.log(stdDev); } Diff就是高斯分布(正态分布)中的x-mean,而下面的那一长串就是对高斯分布的公式对数化得到的。 回到logJointDensitiesForInstance中,clusterPriors的代码如下: public double[] clusterPriors() {
double[] n = new double[m_priors.length];
System.arraycopy(m_priors, 0, n, 0, n.length); return n; } 只是复制一下,而在clusterPriors后,weights[i] += Math.log(priors[i])是把先验取对数后累加进去,可以展开来,也就是P(xi|zi)P(zi)的P(zi),log(P(xi|zi)P(zi)) = log(P(xi|zi)) + log(P(zi))。 再回到logDensityForInstance中,这里的Math.exp(a[i]-max)看起来奇怪,公式里没有a[i]-max,这其实是log-sum-exp数值稳定技巧:先减去最大值max再取指数,防止指数运算上溢/下溢,最后在返回值中把max加回来,结果不变,可以参考一下Andrew Ng的lecture notes中的supervised learning第28页。其实也就是分母。 E函数中,有distributionForInstance这个函数如下: public double[] distributionForInstance(Instance instance) throws Exception {
return Utils.logs2probs(logJointDensitiesForInstance(instance)); } 函数logs2probs注释中写,将概率取自然对数后的数组转换回概率,概率的和为1。 Converts an array containing the natural logarithms of probabilities stored in a vector back into probabilities. The probabilities are assumed to sum to one. private void estimate_priors(Instances inst) throws Exception {
for (int i = 0; i < m_num_clusters; i++) { m_priors[i] = 0.0; }
for (int i = 0; i < inst.numInstances(); i++) { for (int j = 0; j < m_num_clusters; j++) { m_priors[j] += inst.instance(i).weight() * m_weights[i][j]; } }
Utils.normalize(m_priors); } 计算每个类的先验概率,就是将每个样本的权重 × 属于某个类的概率再累加起来。 下面是M函数的内容: new_estimators的代码如下: private void new_estimators() { for (int i = 0; i < m_num_clusters; i++) { for (int j = 0; j < m_num_attribs; j++) { if (m_theInstances.attribute(j).isNominal()) { m_model[i][j] = new DiscreteEstimator(m_theInstances .attribute(j).numValues(), true); } else { m_modelNormal[i][j][0] = m_modelNormal[i][j][1] = m_modelNormal[i][j][2] = 0.0; } } } } 重新初始化m_model和m_modelNormal。 for (i = 0; i < m_num_clusters; i++) { for (j = 0; j < m_num_attribs; j++) { for (l = 0; l < inst.numInstances(); l++) { Instance in = inst.instance(l); if (!in.isMissing(j)) { if (inst.attribute(j).isNominal()) { m_model[i][j].addValue(in.value(j), in.weight() * m_weights[l][i]); } else { m_modelNormal[i][j][0] += (in.value(j) * in.weight() * m_weights[l][i]); m_modelNormal[i][j][2] += in.weight() * m_weights[l][i]; m_modelNormal[i][j][1] += (in.value(j) * in.value(j) * in.weight() * m_weights[l][i]); } } } } } 这里的代码与初始化的时候是不一样的(当然不一样,一样不就没进步了吗?),如果是离散值,那么比较简单,就是将属于这个簇的权重 × 样本权重累加进估计器就可以了,如果是连续值,也不复杂,记录一下就可以了,当然这还只是累计。 // calculate mean and std deviation for numeric attributes for (j = 0; j < m_num_attribs; j++) { if (!inst.attribute(j).isNominal()) { for (i = 0; i < m_num_clusters; i++) { if (m_modelNormal[i][j][2] <= 0) { m_modelNormal[i][j][1] = Double.MAX_VALUE; // m_modelNormal[i][j][0] = 0; m_modelNormal[i][j][0] = m_minStdDev; } else {
// variance m_modelNormal[i][j][1] = (m_modelNormal[i][j][1] - (m_modelNormal[i][j][0] * m_modelNormal[i][j][0] / m_modelNormal[i][j][2])) / (m_modelNormal[i][j][2]);
if (m_modelNormal[i][j][1] < 0) { m_modelNormal[i][j][1] = 0; }
// std dev double minStdD = (m_minStdDevPerAtt != null) ? m_minStdDevPerAtt[j] : m_minStdDev;
m_modelNormal[i][j][1] = Math .sqrt(m_modelNormal[i][j][1]);
if ((m_modelNormal[i][j][1] <= minStdD)) { m_modelNormal[i][j][1] = inst.attributeStats(j).numericStats.stdDev; if ((m_modelNormal[i][j][1] <= minStdD)) { m_modelNormal[i][j][1] = minStdD; } } if ((m_modelNormal[i][j][1] <= 0)) { m_modelNormal[i][j][1] = m_minStdDev; } if (Double.isInfinite(m_modelNormal[i][j][1])) { m_modelNormal[i][j][1] = m_minStdDev; }
// mean m_modelNormal[i][j][0] /= m_modelNormal[i][j][2]; } } } } 这里还要继续对连续属性进行处理,因为上次只是进行了累加。如果m_modelNormal[i][j][2] <= 0表示没有样本属于这个类,else的代码很长,其实很简单,第一个是计算方差,然后还是得到有关精度的minStdD,开方后得到标准差,再根据精度对标准差进行处理,最后是计算均值。 double CVLogLikely = -Double.MAX_VALUE; double templl, tll; boolean CVincreased = true; m_num_clusters = 1; int num_clusters = m_num_clusters; int i; Random cvr; Instances trainCopy; int numFolds = (m_theInstances.numInstances() < 10) ? m_theInstances .numInstances() : 10;
boolean ok = true; int seed = m_rseed; int restartCount = 0; 只是初始化一点内容。 CLUSTER_SEARCH: while (CVincreased) { // theInstances.stratify(10);
CVincreased = false; cvr = new Random(m_rseed); trainCopy = new Instances(m_theInstances); trainCopy.randomize(cvr); templl = 0.0; for (i = 0; i < numFolds; i++) { Instances cvTrain = trainCopy.trainCV(numFolds, i, cvr); if (num_clusters > cvTrain.numInstances()) { break CLUSTER_SEARCH; } Instances cvTest = trainCopy.testCV(numFolds, i); m_rr = new Random(seed); for (int z = 0; z < 10; z++) m_rr.nextDouble(); m_num_clusters = num_clusters; EM_Init(cvTrain); try { iterate(cvTrain, false); } catch (Exception ex) { // catch any problems - i.e. empty clusters occurring ex.printStackTrace(); seed++; restartCount++; ok = false; if (restartCount > 5) { break CLUSTER_SEARCH; } break; } try { tll = E(cvTest, false); } catch (Exception ex) { ex.printStackTrace(); seed++; restartCount++; ok = false; if (restartCount > 5) { break CLUSTER_SEARCH; } break; }
templl += tll; }
if (ok) { restartCount = 0; seed = m_rseed; templl /= (double) numFolds;
if (templl > CVLogLikely) { CVLogLikely = templl; CVincreased = true; num_clusters++; } } } trainCopy是待聚类的数据,如果没有什么意外那么numFolds等于10,trainCV和testCV相当于是cross validation中的第i次的训练和测试数据。如果要聚的类比样本还多,当然是break。再用EM_Init初始化,iterate进行迭代,然后用E在cvTest上测试得到log likelihood,如果没有遇到异常,那么求得平均的templl,看增加num_clusters后是不是取得了更好的结果,如果没有取得,那么整个循环就结束了。
|