An example of the simplest form of UDF: a single value as input and a single value in return.
/myhome/pigUtil.js
:
'use strict';
/**
 * Increment a quantity by one.
 *
 * The argument is parsed as a base-10 integer first, so numeric strings are
 * accepted. A value that cannot be parsed yields NaN.
 *
 * @param {*} rawQty - The quantity to increment.
 * @returns {number} The parsed quantity plus one.
 */
inc.outputSchema = 'integer:double';
function inc(rawQty) {
  return parseInt(rawQty, 10) + 1;
}
The Pig script registers the JavaScript UDF as pigUtil, loads the data, generates the final dataset and then saves the result as CSV output.
-- Register the UDF to be able to use it. Functions defined in the file are
-- called through the pigUtil namespace.
register /myhome/pigUtil.js using javascript as pigUtil;

-- Load the semicolon-separated data file.
data = LOAD '/data/my-file.csv' USING PigStorage(';') AS (
    id:int,
    qty:int,
    title:chararray,
    category:chararray
);

-- Parse the data: qty is passed through the inc UDF, every other column is
-- kept as-is.
data_prep = FOREACH data GENERATE
    id AS id,
    pigUtil.inc(qty) AS qty,
    title AS title,
    category AS category;

-- Store the result into a new location.
STORE data_prep INTO '/data/output/my-file' USING PigStorage(';', '-schema');
And now for the example of handling bags with a JavaScript UDF. This turned out to be a bit tricky: there was nothing about it in the official docs, so I had to figure it out for myself.
/myhome/pigUtil.js
:
'use strict';
// Pig already ships a SUM function, so this UDF is not strictly needed;
// it is kept as an example of walking a bag from JavaScript.

/**
 * Sum the `qty` field of every tuple in a bag.
 *
 * Tuples whose `qty` is missing or falsy are skipped; every other `qty` is
 * parsed as a base-10 integer before being added.
 *
 * @param {Object} bag - Collection of tuples, each optionally carrying `qty`.
 * @returns {number} The total of all parsed `qty` values (0 for an empty bag).
 */
sumCategoryQty.outputSchema = 'integer:double';
function sumCategoryQty(bag) {
  var total = 0;
  var key;
  var row;

  for (key in bag) {
    // Only visit the bag's own entries, not inherited properties.
    if (!bag.hasOwnProperty(key)) {
      continue;
    }

    row = bag[key];
    if (!row.qty) {
      continue;
    }

    total += parseInt(row.qty, 10);
  }

  return total;
}
Same as above, but this time we group the dataset by one or more columns. The grouped dataset is stored with the values you grouped on in one column, and the rest of the columns collected inside a bag that is stored in a single column.
-- Register the UDF file; its functions are called through the pigUtil namespace.
register /myhome/pigUtil.js using javascript as pigUtil;

-- Load the semicolon-separated data file.
data = LOAD '/data/my-file.csv' USING PigStorage(';') AS (
    id:int,
    qty:int,
    title:chararray,
    category:chararray
);

-- Group the dataset by category.
data_group = GROUP data BY (category);
The format of the data represented in JSON after grouping:
["category":"FooCat", [{"id":123, "qty":2, "title":"Foo"}, {"id":124, "qty":1, "title":"Bar"}]],
["category":"BarCat", [{"id":234, "qty":3, "title":"Gomle"}, {"id":235, "qty":2, "title":"FooBar"}]]
Then the rest of the Pig script:
-- Parse the grouped data and use the UDF to calculate the total for each
-- category. The FOREACH must run over data_group (the GROUP result): only
-- that relation exposes the `group` key and the `data` bag.
data_prep = FOREACH data_group GENERATE
    group AS category,
    pigUtil.sumCategoryQty(data) AS categoryTotal;

-- Store the per-category totals.
STORE data_prep INTO '/data/output/my-file' USING PigStorage(';', '-schema');
/**
 * Debug helper: print every field of every tuple in a bag.
 *
 * Each own property of each tuple is written with `print` as `name=value, `.
 *
 * @param {Object} bag - Collection of tuples to dump.
 * @returns {string} Always the literal string 'done'.
 */
bagDump.outputSchema = 'word:chararray';
function bagDump(bag) {
  var rowKey;
  var fieldName;
  var row;

  for (rowKey in bag) {
    if (!bag.hasOwnProperty(rowKey)) {
      continue;
    }

    row = bag[rowKey];
    for (fieldName in row) {
      if (!row.hasOwnProperty(fieldName)) {
        continue;
      }
      // `print` is supplied by the hosting script engine, hence the lint
      // suppression around the call.
      /* jshint ignore:start */
      print(fieldName + '=' + row[fieldName] + ', ');
      /* jshint ignore:end */
    }
  }

  return 'done';
}