おそらく組み込み関数のみをトリッキーな方法で使用することで結果を得ることができますが、単純な UDF でも同じことができます。
/**
 * Pig UDF that converts a bag of tuples into a bag of adjacent pairs.
 *
 * <p>The first field of every tuple in the input bag is read in the bag's
 * iteration order, and each consecutive pair of values is emitted as a
 * two-field tuple. For example, a bag iterated as {@code (a), (b), (c)}
 * produces {@code (a, b), (b, c)}.
 */
public class SlidingTuple extends EvalFunc<DataBag> {
    private static final BagFactory bagFactory = BagFactory.getInstance();
    private static final TupleFactory tupleFactory = TupleFactory.getInstance();

    /**
     * Builds the bag of adjacent pairs for the bag held in the first field of {@code input}.
     *
     * @param input the UDF argument tuple; its first field is cast to a {@link DataBag}
     * @return a bag of two-field tuples, one per pair of consecutive input tuples;
     *         an empty bag when the input bag has fewer than two tuples;
     *         {@code null} when {@code input} is null, has no fields, or its first field is null
     * @throws IOException declared by the overridden method; failures raised inside
     *         this method are rethrown as {@link RuntimeException} with the cause attached
     */
    @Override
    public DataBag exec(Tuple input) throws IOException {
        // No argument supplied: there is nothing to pair up.
        if (input == null || input.size() == 0) {
            return null;
        }
        try {
            DataBag inputBag = (DataBag) input.get(0);
            if (inputBag == null) {
                return null;
            }

            DataBag result = bagFactory.newDefaultBag();
            Iterator<Tuple> it = inputBag.iterator();
            // Guard the first next(): an empty bag yields an empty result
            // instead of failing with NoSuchElementException.
            if (!it.hasNext()) {
                return result;
            }

            Tuple previous = it.next();
            while (it.hasNext()) {
                Tuple current = it.next();
                Tuple pair = tupleFactory.newTuple(2);
                pair.set(0, previous.get(0));
                pair.set(1, current.get(0));
                result.add(pair);
                // Slide the window forward by one element.
                previous = current;
            }
            return result;
        } catch (Exception e) {
            // Kept as an unchecked wrapper so callers see the same exception type as before.
            throw new RuntimeException("SlidingTuple error", e);
        }
    }
}
この UDF を使うには、まず次のようにデータを読み込み、メッセージを単語に分割します:
-- Load the tab-delimited input under the path built from $date as (id, mess) chararray fields.
A = LOAD '/user/hive/warehouse/twitter_raw/$date'
    USING PigStorage('\t')
    AS (id:chararray, mess:chararray);
-- Split each message on the space character; the resulting bag is named words.
B = FOREACH A GENERATE TOKENIZE(mess, ' ') AS words;
次に、カスタム UDF を使用します。
-- Apply the SlidingTuple UDF to the words bag of every record in B.
-- NOTE(review): assumes the jar providing com.example.SlidingTuple has already been REGISTERed — confirm.
C = FOREACH B GENERATE com.example.SlidingTuple(words);