Spark wholeTextFiles - java.lang.OutOfMemoryError

Trying to read a large text file (> 4 GB) with sc.wholeTextFiles() fails with java.lang.OutOfMemoryError:
>> at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
>> at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>> at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
>> at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
>> at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:83)
>> at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:69)
>> at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:143)
>> at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
>> at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1467)
>> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1006)
>> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1006)
>> at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
>> at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1498)
>> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>> at org.apache.spark.scheduler.Task.run(Task.scala:64)
>> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>> at java.lang.Thread.run(Thread.java:745)
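
For context, a minimal sketch of the kind of call that produces this trace (the path and app name are placeholders, not from the original report):

    import org.apache.spark.{SparkConf, SparkContext}

    // wholeTextFiles reads each file as a single (filename, content) record,
    // so the entire file is buffered into one in-memory byte array on an executor
    // (see ByteStreams.toByteArray in the trace above).
    val sc = new SparkContext(new SparkConf().setAppName("wholeTextFiles-oom"))
    val rdd = sc.wholeTextFiles("hdfs:///data/big-file.txt") // hypothetical path
    println(rdd.count()) // count() materializes each record; the OOM is thrown here
    sc.stop()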