<configuration> <property> <name>parquet.column.index.enabled</name> <value>true</value> <description>Enables/disables column indexes.</description> </property> <property> <name>parquet.column.index.cache.size</name> <value>100000</value> <description>Number of elements in the LRU index cache. Cache is disabled when set to zero.</description> </property> </configuration> import org.apache.parquet.column.ColumnWriter; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.example.data.Group; import org.apache.parquet.example.data.simple.SimpleGroupFactory; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Type; import org.apache.parquet.schema.Types; MessageType schema = Types.buildMessage() .required(BINARY).as(UTF8).named("name") .required(INT32).named("age") .named("User"); ParquetWriter<Group> writer = ExampleParquetWriter.builder(file) .withSchema(schema) .withConf(conf) .withCompressionCodec(CompressionCodecName.SNAPPY) .build(); Group group = new SimpleGroupFactory(schema).newGroup() group.add("name", "Alice") group.add("age", 25) writer.write(group); writer.close(); import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.ColumnReader; import org.apache.parquet.column.impl.ColumnReadStoreImpl; import org.apache.parquet.io.ColumnIOFactory; import org.apache.parquet.io.MessageColumnIO; import org.apache.parquet.schema.MessageType; ColumnIOFactory columnIOFactory = new ColumnIOFactory(); MessageColumnIO columnIO = columnIOFactory.getColumnIO(schema); ColumnReader columnReader = columnIO.getRecordReader(readStore, columnDescriptor); while (columnReader.getCurrentRepetitionLevel() != 0) { columnReader.consume(); }


上一篇:
下一篇:
切换中文