<configuration>
  <property>
    <name>parquet.column.index.enabled</name>
    <value>true</value>
    <description>Enables/disables column indexes.</description>
  </property>
  <property>
    <name>parquet.column.index.cache.size</name>
    <value>100000</value>
    <description>Number of elements in the LRU index cache. The cache is disabled when set to zero.</description>
  </property>
</configuration>
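The same properties can also be set programmatically. A minimal sketch, assuming a Hadoop Configuration carries them; the property names are taken verbatim from the XML above:

import org.apache.hadoop.conf.Configuration;

Configuration conf = new Configuration();
// Mirror the XML settings above (names as given in this document).
conf.setBoolean("parquet.column.index.enabled", true);
conf.setInt("parquet.column.index.cache.size", 100000);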
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;

import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
// Define a two-column message type: a UTF-8 string and a 32-bit integer.
MessageType schema = Types.buildMessage()
    .required(BINARY).as(UTF8).named("name")
    .required(INT32).named("age")
    .named("User");
// Build a writer for the example Group object model. The path and
// configuration are illustrative; substitute your own.
Configuration conf = new Configuration();
Path file = new Path("users.parquet");
ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
    .withSchema(schema)
    .withConf(conf)
    .withCompressionCodec(CompressionCodecName.SNAPPY)
    .build();
// Create and write a single record, then close the writer to
// finalize the file footer.
Group group = new SimpleGroupFactory(schema).newGroup();
group.add("name", "Alice");
group.add("age", 25);
writer.write(group);
writer.close();
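Column indexes pay off at read time, when a filter predicate lets whole pages be skipped. A minimal sketch using the parquet-mr filter API; the age == 25 predicate, and the reuse of `file` and `conf` from the writer above, are illustrative:

import org.apache.parquet.example.data.Group;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;

// Pages whose min/max statistics exclude age == 25 can be skipped
// without being read or decompressed.
ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
    .withConf(conf)
    .withFilter(FilterCompat.get(eq(intColumn("age"), 25)))
    .build();

Group match;
while ((match = reader.read()) != null) {
  System.out.println(match.getString("name", 0));
}
reader.close();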
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.schema.MessageType;
// A ColumnReader is obtained from a ColumnReadStoreImpl built per row
// group; `pages` and `createdBy` come from the file reader sketched below.
ColumnReadStoreImpl readStore = new ColumnReadStoreImpl(
    pages, new GroupRecordConverter(schema).getRootConverter(), schema, createdBy);
ColumnDescriptor columnDescriptor = schema.getColumns().get(0);
ColumnReader columnReader = readStore.getColumnReader(columnDescriptor);

// Repetition level 0 marks a record boundary; consume values until then.
while (columnReader.getCurrentRepetitionLevel() != 0) {
  columnReader.consume();
}
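For completeness, a sketch of where `pages` and `createdBy` come from, assuming the same `file` and `conf` as in the writer example:

import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

ParquetFileReader fileReader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf));
MessageType schema = fileReader.getFileMetaData().getSchema();
String createdBy = fileReader.getFileMetaData().getCreatedBy();

// Each call to readNextRowGroup() yields the PageReadStore of one row group.
PageReadStore pages;
while ((pages = fileReader.readNextRowGroup()) != null) {
  // Build a ColumnReadStoreImpl here, as shown above.
}
fileReader.close();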