
Spark: value reduceByKey is not a member

After clustering some sparse vectors, I need to find the intersecting vectors in each cluster. To do that, I reduce the MLlib vectors as in the following example:

import org.apache.spark.SparkConf 
import org.apache.spark.SparkContext 
import org.apache.spark.mllib.clustering.KMeans 
import org.apache.spark.mllib.linalg.Vectors 

//For Sparse Vector 
import org.apache.spark.mllib.regression.LabeledPoint 
import org.apache.spark.mllib.util.MLUtils 
import org.apache.spark.rdd.RDD 
import org.apache.spark.mllib.linalg.{Vector, Vectors} 

object Recommend { 

    def main(args: Array[String]) { 
    // set up environment 
    val conf = new SparkConf() 
     .setAppName("Test") 
     .set("spark.executor.memory", "2g") 
    val sc = new SparkContext(conf) 

    // Some vectors 
    val vLen = 1800 
    val sv11: Vector = Vectors.sparse(vLen,Seq((100,1.0), (110,1.0), (120,1.0), (130, 1.0))) 
    val sv12: Vector = Vectors.sparse(vLen,Seq((100,1.0), (110,1.0), (120,1.0), (130, 1.0), (140, 1.0) )) 
    val sv13: Vector = Vectors.sparse(vLen,Seq((100,1.0), (120,1.0), (130,1.0))) 
    val sv14: Vector = Vectors.sparse(vLen,Seq((110,1.0), (130, 1.0))) 
    val sv15: Vector = Vectors.sparse(vLen,Seq((140, 1.0))) 

    val sv21: Vector = Vectors.sparse(vLen,Seq((200,1.0), (210,1.0), (220,1.0), (230, 1.0))) 
    val sv22: Vector = Vectors.sparse(vLen,Seq((200,1.0), (210,1.0), (220,1.0), (230, 1.0), (240, 1.0) )) 
    val sv23: Vector = Vectors.sparse(vLen,Seq((200,1.0), (220,1.0), (230,1.0))) 
    val sv24: Vector = Vectors.sparse(vLen,Seq((210,1.0), (230, 1.0))) 
    val sv25: Vector = Vectors.sparse(vLen,Seq((240, 1.0))) 

    val sv31: Vector = Vectors.sparse(vLen,Seq((300,1.0), (310,1.0), (320,1.0), (330, 1.0))) 
    val sv32: Vector = Vectors.sparse(vLen,Seq((300,1.0), (310,1.0), (320,1.0), (330, 1.0), (340, 1.0) )) 
    val sv33: Vector = Vectors.sparse(vLen,Seq((300,1.0), (320,1.0), (330,1.0))) 
    val sv34: Vector = Vectors.sparse(vLen,Seq((310,1.0), (330, 1.0))) 
    val sv35: Vector = Vectors.sparse(vLen,Seq((340, 1.0))) 

    val sparseData = sc.parallelize(Seq(
     sv11, sv12, sv13, sv14, sv15, 
     sv21, sv22, sv23, sv24, sv25, 
     sv31, sv32, sv33, sv34, sv35 
     )) 

    // Cluster the data into three clusters using KMeans 
    val numClusters = 3 
    val numIterations = 20 

    test(numClusters, numIterations, sparseData) 
    } 

    def test(numClusters:Int, numIterations:Int, 
     data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]) = { 

    val clusters = KMeans.train(data, numClusters, numIterations) 

    val predictions = data.map(v => (clusters.predict(v), v)) 

    predictions.reduceByKey((v1, v2) => v1) 

    } 
} 

predictions.reduceByKey((v1, v2) => v1) causes the error:

value reduceByKey is not a member of org.apache.spark.rdd.RDD[(Int, org.apache.spark.mllib.linalg.Vector)] 

What is the reason for this?

+1

Possible duplicate of "reduceByKey method not being found in Scala Spark" (http://stackoverflow.com/questions/23943852/reducebykey-method-not-being-found-in-scala-spark) – 2015-03-03 14:13:15

+0

Thanks for the fix)) – zork 2015-03-03 14:24:41

Answer


As you have already guessed, your code needs this import added:

import org.apache.spark.SparkContext._ 

Why? Because it brings a number of implicit conversions into scope, and the important one for your case is the conversion that wraps an RDD of key/value pairs (Tuple2) in PairRDDFunctions. Spark then treats the left element of the tuple as the key, which gives you access to handy pair transformations and actions such as reduceByKey.
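
For illustration, here is a minimal sketch of the test method with the import in place (assuming the same Spark 1.x setup as in the question; the helper is otherwise the same as yours):

// Sketch only: same logic as the question's test method, with the extra import added 
import org.apache.spark.SparkContext._        // brings rddToPairRDDFunctions into scope 
import org.apache.spark.mllib.clustering.KMeans 
import org.apache.spark.mllib.linalg.Vector 
import org.apache.spark.rdd.RDD 

def test(numClusters: Int, numIterations: Int, data: RDD[Vector]): RDD[(Int, Vector)] = { 
    // Train the model and pair each vector with its predicted cluster id 
    val clusters = KMeans.train(data, numClusters, numIterations) 
    val predictions = data.map(v => (clusters.predict(v), v)) 

    // Compiles now: the implicit conversion wraps RDD[(Int, Vector)] in PairRDDFunctions 
    predictions.reduceByKey((v1, v2) => v1) 
} 

Note that from Spark 1.3 onward these implicit conversions were moved into the RDD companion object, so the extra import should no longer be strictly required on newer versions.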

Regards,
