mesos · trulite · May 19, 2013 · May 19, 2013 · May 20, 2013 · JoshRosen
diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala
@@ -713,6 +713,42 @@ abstract class RDD[T: ClassManifest](
     return buf.toArray
   }
 
+  /**
+   * Skip the first `drop` elements of the RDD, then return up to the next `num` elements.
+   * This currently scans the partitions *one by one*, so it will be slow if a lot of
+   * partitions are required. In that case, consider collect() and slicing the result locally.
+   * @param drop number of leading elements to skip (values <= 0 skip nothing)
+   * @param num  maximum number of elements to return (values <= 0 return an empty array)
+   */
+  def dropTake(drop: Int, num: Int): Array[T] = {
+    if (num <= 0) {
+      return new Array[T](0)
+    }
+    val buf = new ArrayBuffer[T]
+    var p = 0
+    // Running count of skipped elements, kept on the driver. Each task reports how many it
+    // skipped through its result, so the count cannot be double-added or inflated by retries.
+    var dropped = 0
+    while (buf.size < num && p < partitions.size) {
+      val left = num - buf.size
+      val toDrop = math.max(drop - dropped, 0)
+      val res = sc.runJob(this, (it: Iterator[T]) => {
+        var skipped = 0
+        while (skipped < toDrop && it.hasNext) {
+          it.next()
+          skipped += 1
+        }
+        // Only take from this partition once the whole drop quota has been consumed.
+        val taken = if (skipped < toDrop) new Array[T](0) else it.take(left).toArray
+        (skipped, taken)
+      }, Array(p), true)
+      dropped += res(0)._1
+      buf ++= res(0)._2
+      p += 1
+    }
+    buf.toArray
+  }
+
   /**
    * Return the first element in this RDD.
    */

diff --git a/core/src/test/scala/spark/RDDSuite.scala b/core/src/test/scala/spark/RDDSuite.scala
@@ -26,6 +26,8 @@ class RDDSuite extends FunSuite with LocalSparkContext {
     assert(nums.union(nums).collect().toList === List(1, 2, 3, 4, 1, 2, 3, 4))
     assert(nums.glom().map(_.toList).collect().toList === List(List(1, 2), List(3, 4)))
     assert(nums.collect({ case i if i >= 3 => i.toString }).collect().toList === List("3", "4"))
+    assert(nums.take(2).toList === List(1, 2))
+    assert(nums.dropTake(1,2).toList === List(2, 3))
     assert(nums.keyBy(_.toString).collect().toList === List(("1", 1), ("2", 2), ("3", 3), ("4", 4)))
     val partitionSums = nums.mapPartitions(iter => Iterator(iter.reduceLeft(_ + _)))
     assert(partitionSums.collect().toList === List(3, 7))