canopy算法流程_求助,kmeans(Canopy)算法如何正确导入数据集
該樓層疑似違規已被系統折疊,隱藏此樓查看此樓
import org.slf4j.LoggerFactory
import scala.collection.mutable.HashSet
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object Canopy {

  /**
   * Canopy pre-clustering driver (commonly used to seed k-means).
   * Reads lines of the form `id,x1,x2,...,xn`, selects canopy centers so
   * that no two chosen centers are closer than `t2` (squared distance),
   * and writes them back out as `id,x1,...,xn`.
   */
  def main(args: Array[String]): Unit = {
    val input = "/usr/data_test.csv"
    val output = "data.csv"
    val slices = 8      // parallelism hint for reading the input
    val t1 = 8.0        // loose threshold (unused by this simplified center selection)
    val t2 = 4.0        // tight threshold: minimum spacing between accepted centers
    val log = LoggerFactory.getLogger("Canopy")
    val conf = new SparkConf().setAppName("Canopy").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // BUG FIX: the original did pair(1).split(",") AFTER the line was
      // already split on "," — pair(1) holds a single token, so every
      // vector collapsed to one dimension. Take all fields after the id.
      // Also pass `slices` (previously computed but never used).
      val pairs = sc.textFile(input, slices).map { line =>
        val fields = line.split(",")
        (fields(0), fields.drop(1).map(_.toDouble))
      }
      pairs.foreach(println)

      // First pass: candidate centers. NOTE(review): map_centers is captured
      // by the closure, so each executor mutates its own deserialized copy —
      // candidates are only locally spaced; the driver-side pass below
      // enforces global t2 spacing.
      val map_centers = new HashSet[(String, Array[Double])]
      val raw_center_pairs = pairs
        .map(v => (v._1, canopy_(v, map_centers, t2)))
        .filter(_._2 != null)
        .collect()
        .toList

      // Second pass (driver side): deduplicate the collected candidates.
      // BUG FIX: the original `for (i` had lost its generator (HTML-mangled).
      val center_pairs = new HashSet[(String, Array[Double])]
      for (i <- raw_center_pairs.indices) {
        canopy_(raw_center_pairs(i)._2, center_pairs, t2)
      }

      // BUG FIX: original concatenated id and values with no separator,
      // producing e.g. "p11.0,2.0" instead of "p1,1.0,2.0".
      sc.makeRDD(center_pairs.toList, 1).map { pair =>
        pair._1 + "," + pair._2.mkString(",")
      }.saveAsTextFile(output)
    } catch {
      case e: Exception =>
        log.info(e.getStackTrace.mkString("\n"))
    }
    sc.stop()
  }

  /**
   * Squared Euclidean distance over the common prefix of the two vectors.
   * Using the shorter length guards against ragged input rows.
   */
  def measure(v1: Array[Double], v2: Array[Double]): Double = {
    var distance = 0.0
    val n = math.min(v1.length, v2.length)
    // BUG FIX: the original `for (i` had lost its generator (HTML-mangled).
    for (i <- 0 until n) {
      distance += scala.math.pow(v1(i) - v2(i), 2)
    }
    distance
  }

  /**
   * Accepts p0 as a new canopy center iff no already-accepted center in
   * `pair` lies within t2 (squared distance); mutates `pair` on acceptance.
   * Returns p0 when accepted, null otherwise — the null contract is kept
   * because main filters on `_._2 != null`.
   */
  def canopy_(p0: (String, Array[Double]), pair: HashSet[(String, Array[Double])], t2: Double): (String, Array[Double]) = {
    if (!pair.exists(p => measure(p._2, p0._2) < t2)) {
      pair += p0
      p0
    } else {
      null
    }
  }
}
總結(jié)
以上是生活隨筆為你收集整理的canopy算法流程_求助,kmeans(Canopy)算法如何正確導入數據集的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: python对话框代码_Python、t
- 下一篇: python 结束子线程并保证工作完成_