GenericUDF的示例, 根据字符串生成词向量

Posted on 2018-05-23 18:15:00

GenericUDF的示例, 根据字符串生成词向量

相比普通的UDF, GenericUDF提供了更好的参数和返回值类型检查, 效率更高, 适合处理Hive中的复杂数据类型(例如array、map、struct)

把字符串变成词向量, 例如:

"This is a sentence"->{'This':1, 'is':1, 'a':1, 'sentence':1}

如果UDF用到了外部依赖, 为了让集群的每个节点都能执行该jar, 可以用Eclipse的 Export -> Runnable JAR File 把依赖一并打包进jar

package cn.pywei.HiveUDF;

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;

/**
 * Hive GenericUDF that turns a string into a word-count map.
 *
 * <p>The input is split on single space characters; blank tokens are skipped.
 * For example {@code "This is a sentence"} yields
 * {@code {"This":1, "is":1, "a":1, "sentence":1}}.
 *
 * <p>The function accepts exactly one argument of type string, char, varchar
 * or void (NULL) and returns {@code map<string,int>}. A NULL input yields an
 * empty map.
 */
@Description(name="WordsArray",value="_FUNC_(string), return the word array by using GenericUDF.")
public class WordArray extends GenericUDF {
    /** Function name used when rendering the call in EXPLAIN output. */
    private static final String FUNC_NAME = "WordsArray";

    /**
     * Result buffer handed back from {@link #evaluate}. The same instance is
     * used for successive calls, so it is cleared at the start of every call
     * to keep counts from one row from leaking into the next.
     */
    private final Map<Text, IntWritable> wordCounts = new HashMap<Text, IntWritable>();

    /** Converts the (string/char/varchar) argument to a writable {@link Text}. */
    private ObjectInspectorConverters.Converter converter;

    /**
     * Validates the argument list and prepares the input converter.
     *
     * @param arguments inspectors describing the call arguments
     * @return an inspector for {@code map<Text, IntWritable>}, the type produced by {@link #evaluate}
     * @throws UDFArgumentException if the argument count is not 1
     * @throws UDFArgumentTypeException if the argument is not a string-like primitive (or void)
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // check the input argument count
        if (arguments.length != 1) {
            throw new UDFArgumentException("Param must be 1 argu.");
        }

        // check the input argument type; the only argument has index 0
        if (arguments[0].getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "A string argument was expected.");
        }

        PrimitiveCategory primitiveCategory =
                ((PrimitiveObjectInspector) arguments[0]).getPrimitiveCategory();
        boolean stringLike = primitiveCategory == PrimitiveCategory.STRING
                || primitiveCategory == PrimitiveCategory.CHAR
                || primitiveCategory == PrimitiveCategory.VARCHAR
                || primitiveCategory == PrimitiveCategory.VOID;
        if (!stringLike) {
            throw new UDFArgumentTypeException(0,
                    "A string, char, varchar or null argument was expected");
        }

        // generate a converter for the argument to use in the evaluate function
        converter = ObjectInspectorConverters.getConverter(
                arguments[0], PrimitiveObjectInspectorFactory.writableStringObjectInspector);

        // return the inspector that describes the return value of the evaluate function
        return ObjectInspectorFactory.getStandardMapObjectInspector(
                PrimitiveObjectInspectorFactory.writableStringObjectInspector,
                PrimitiveObjectInspectorFactory.writableIntObjectInspector);
    }

    /**
     * Counts the space-separated words of the single string argument.
     *
     * @param arguments the deferred call arguments; only index 0 is read
     * @return a map from word to occurrence count for this call only;
     *         empty when the argument is NULL
     * @throws HiveException if the deferred argument cannot be evaluated
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Reset the shared buffer so earlier calls do not pollute this result.
        wordCounts.clear();

        // Evaluate the deferred argument once and reuse the value.
        Object raw = arguments[0].get();
        if (raw == null) {
            return wordCounts;
        }

        Text text = (Text) converter.convert(raw);
        if (text == null) {
            return wordCounts;
        }

        // populate the word counts
        for (String token : text.toString().split(" ")) {
            if (StringUtils.isBlank(token)) {
                continue;
            }
            Text key = new Text(token);
            IntWritable count = wordCounts.get(key);
            if (count == null) {
                wordCounts.put(key, new IntWritable(1));
            } else {
                // IntWritable is mutable: bump in place instead of re-inserting.
                count.set(count.get() + 1);
            }
        }
        return wordCounts;
    }

    /**
     * Renders the call for the HQL EXPLAIN output.
     *
     * @param children display strings of the argument expressions
     * @return the function name followed by the comma-separated arguments in parentheses
     */
    @Override
    public String getDisplayString(String[] children) {
        return FUNC_NAME + "(" + String.join(", ", children) + ")";
    }
}