
Commit cdba4a3

Author: Abul Basar
Commit message: update
1 parent d5a8211

4 files changed: +177 -20 lines

pom.xml

Lines changed: 33 additions & 2 deletions
@@ -9,7 +9,7 @@
     <version>0.1</version>

     <properties>
-        <sparkVersion>2.4.0</sparkVersion>
+        <sparkVersion>2.2.1</sparkVersion>
         <scalaVersion>2.11</scalaVersion>
         <maven.compiler.source>1.8</maven.compiler.source>
         <maven.compiler.target>1.8</maven.compiler.target>
@@ -61,7 +61,12 @@
         <dependency>
             <groupId>org.apache.hbase</groupId>
             <artifactId>hbase-client</artifactId>
-            <version>2.1.0</version>
+            <version>1.3.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hbase</groupId>
+            <artifactId>hbase-server</artifactId>
+            <version>1.3.0</version>
         </dependency>

         <!--dependency>
@@ -71,4 +76,30 @@
         </dependency-->

     </dependencies>
+
+    <build>
+
+        <plugins>
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>3.1.0</version>
+                <configuration>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-assembly</id> <!-- this is used for inheritance merges -->
+                        <phase>package</phase> <!-- bind to the packaging phase -->
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
+
 </project>
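Note on the build change: binding the maven-assembly-plugin's single goal (with the jar-with-dependencies descriptor) to the package phase makes a plain mvn package also emit a fat jar under target/ (by default named <artifactId>-<version>-jar-with-dependencies.jar) that bundles the HBase client classes, which are not on Spark's default classpath; that self-contained jar is what one would hand to spark-submit. The Spark (2.2.1) and hbase-client/hbase-server (1.3.0) versions are pinned together here, presumably to match the versions running on the target cluster.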

src/main/java/com/example/LoadToHBase.java

Lines changed: 64 additions & 17 deletions
@@ -4,15 +4,23 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.KeyValue;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
+import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.ForeachPartitionFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;
+import scala.Tuple2;
+

 import java.io.IOException;
 import java.io.Serializable;
@@ -29,7 +37,8 @@ public class LoadToHBase implements Serializable{
     public LoadToHBase(){
         conf = new SparkConf()
                 .setAppName(getClass().getName())
-                .setIfMissing("spark.master", "local[*]");
+                .setIfMissing("spark.master", "local[*]")
+                .setIfMissing("spark.driver.memory", "4g");
         spark = SparkSession.builder().config(conf).getOrCreate();
     }

@@ -52,13 +61,19 @@ private void saveStockRecords(Iterator<Stock> rows){
             conn = ConnectionFactory.createConnection(configuration);
             table = conn.getTable(TableName.valueOf("ns1:stocks"));
             List<Put> puts = new ArrayList<>();
+            int batchSize = 2000;
+            int count = 0;
             while (rows.hasNext()){
                 Stock stock = rows.next();
                 puts.add(stock.toPut());
+                if(puts.size() % batchSize == 0){
+                    table.put(puts);
+                    puts.clear();
+                }
+                ++count;
             }
-            System.out.println(String.format("Saving %d records", puts.size()));
-            Object[] results = new Object[puts.size()];
             table.put(puts);
+            System.out.println(String.format("Saving %d records", count));
             table.close();
         } catch (IOException ex){
             ex.printStackTrace();
@@ -71,32 +86,63 @@ private void saveStockRecords(Iterator<Stock> rows){
         }
     }

+    private Stock rowToStock(Row row){
+        Stock stock = new Stock();
+
+        stock.setDate(row.getAs("date"));
+
+        stock.setOpen(row.getAs("open"));
+        stock.setClose(row.getAs("close"));
+        stock.setHigh(row.getAs("high"));
+        stock.setLow(row.getAs("low"));
+        stock.setClose(row.getAs("close"));
+        stock.setAdjclose(row.getAs("adjclose"));
+        stock.setVolume(row.getAs("volume"));
+        stock.setSymbol(row.getAs("symbol"));
+
+        return stock;
+    }
+
     public void saveToHBase(String path){
         Dataset<Row> dataset = loadCsv(path).withColumn("date"
                 , functions.expr("cast(`date` as date) as `date`"));

-        Dataset<Stock> stockRows = dataset.map((MapFunction<Row, Stock>) row ->{
-            Stock stock = new Stock();
+        Dataset<Stock> stockRows = dataset.map((MapFunction<Row, Stock>) row -> rowToStock(row), Encoders.bean(Stock.class));

-            stock.setDate(row.getAs("date"));
+        stockRows.show();

-            stock.setOpen(row.getAs("open"));
-            stock.setClose(row.getAs("close"));
-            stock.setHigh(row.getAs("high"));
-            stock.setLow(row.getAs("low"));
-            stock.setClose(row.getAs("close"));
-            stock.setAdjclose(row.getAs("adjclose"));
-            stock.setVolume(row.getAs("volume"));
-            stock.setSymbol(row.getAs("symbol"));
+        stockRows.foreachPartition((ForeachPartitionFunction<Stock>) rows -> saveStockRecords(rows));
+    }

-            return stock;
-        }, Encoders.bean(Stock.class));
+
+    public void createHFiles(String path, String outputPath){
+        Dataset<Row> dataset = loadCsv(path).withColumn("date"
+                , functions.expr("cast(`date` as date) as `date`"));
+
+        Dataset<Stock> stockRows = dataset.map((MapFunction<Row, Stock>) row -> rowToStock(row), Encoders.bean(Stock.class));

         stockRows.show();

-        stockRows.foreachPartition((ForeachPartitionFunction<Stock>) rows -> saveStockRecords(rows));
+        JavaPairRDD<ImmutableBytesWritable, Put> pairRdd = stockRows.javaRDD().mapToPair(r ->
+                new Tuple2<>(r.toKey(), r.toPut()));
+
+
+        Configuration configuration = HBaseConfiguration.create();
+        String resourcePath = LoadToHBase.class
+                .getClassLoader()
+                .getResource("hbase-site.xml")
+                .getPath();
+        configuration.addResource(new Path(resourcePath));
+
+        configuration.set(TableOutputFormat.OUTPUT_TABLE, "ns1:stocks");
+        pairRdd.saveAsNewAPIHadoopFile(outputPath
+                , ImmutableBytesWritable.class
+                , Put.class
+                , TableOutputFormat.class
+                , configuration);

     }
+
     public void close(){
         spark.close();
     }
@@ -105,6 +151,7 @@ public static void main(String[] agrs){
         String path = "/data/stocks.csv";
         LoadToHBase loadToHBase = new LoadToHBase();
         loadToHBase.saveToHBase(path);
+        //loadToHBase.createHFiles(path, "/tmp/stocks_hfile");
         loadToHBase.close();

     }
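Note on saveStockRecords: the new batching keeps at most batchSize Puts in memory per partition before flushing. Pulling the whole per-partition write path together, a minimal sketch of the pattern might look like the following. This is not the commit's exact code: the configuration loading is copied from createHFiles, try-with-resources stands in for the explicit close calls, and the table name and batch size simply mirror the diff.

    stockRows.foreachPartition((ForeachPartitionFunction<Stock>) rows -> {
        // Build the HBase configuration the same way createHFiles does
        // (assumption: hbase-site.xml is available as a classpath resource).
        Configuration hbaseConf = HBaseConfiguration.create();
        hbaseConf.addResource(new Path(LoadToHBase.class.getClassLoader()
                .getResource("hbase-site.xml").getPath()));

        // One Connection/Table per Spark partition; try-with-resources closes both.
        try (Connection conn = ConnectionFactory.createConnection(hbaseConf);
             Table table = conn.getTable(TableName.valueOf("ns1:stocks"))) {
            List<Put> puts = new ArrayList<>();
            while (rows.hasNext()) {
                puts.add(rows.next().toPut());
                if (puts.size() == 2000) {   // flush full batches to bound memory use
                    table.put(puts);
                    puts.clear();
                }
            }
            table.put(puts);                 // flush the final, partial batch
        }
    });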
src/main/java/com/example/QueryHBaseTable.java

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+package com.example;
+
+import com.example.helper.Stock;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+public class QueryHBaseTable{
+    private SparkSession spark = null;
+    private SparkConf conf = null;
+    public QueryHBaseTable(){
+        conf = new SparkConf()
+                .setAppName(getClass().getName())
+                .setIfMissing("spark.master", "local[*]")
+                .setIfMissing("spark.driver.memory", "4g");
+        spark = SparkSession.builder().config(conf).getOrCreate();
+    }
+
+    public void loadFromHBase( ){
+
+        Configuration configuration = HBaseConfiguration.create();
+        String path = LoadToHBase.class
+                .getClassLoader()
+                .getResource("hbase-site.xml")
+                .getPath();
+        configuration.addResource(new Path(path));
+
+        configuration.set(TableInputFormat.INPUT_TABLE, "ns1:stocks");
+
+
+        JavaPairRDD<ImmutableBytesWritable, Result> rows = spark.sparkContext().newAPIHadoopRDD(configuration
+                , TableInputFormat.class
+                , ImmutableBytesWritable.class
+                , Result.class).toJavaRDD().mapToPair(r -> r);
+
+        Dataset<Row> df = spark.createDataFrame(rows.map(r -> Stock.parse(r._2)), Stock.class);
+        df.show();
+
+    }
+
+    public static void main(String[] args){
+        new QueryHBaseTable().loadFromHBase();
+    }
+}
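A possible follow-up to loadFromHBase (not part of this commit): since the rows come back as a DataFrame built from the Stock bean, it can be registered as a temporary view and queried with Spark SQL. The view name and the aggregate below are only illustrative.

    df.createOrReplaceTempView("stocks");
    spark.sql("SELECT symbol, max(close) AS max_close FROM stocks GROUP BY symbol").show();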

src/main/java/com/example/helper/Stock.java

Lines changed: 28 additions & 1 deletion
@@ -1,12 +1,17 @@
 package com.example.helper;

+import org.apache.hadoop.hbase.KeyValue;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
 import org.apache.hadoop.hbase.util.Bytes;
+import scala.Tuple2;

 import java.io.Serializable;
 import java.sql.Date;
-
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;


 public class Stock implements Serializable{
@@ -130,6 +135,22 @@ public Put toPut(){
         return put;
     }

+    public Iterator<Tuple2<ImmutableBytesWritable, KeyValue>> toKVPairs(){
+        long epoch = date.getTime();
+        byte[] rowkey = Bytes.toBytes(String.format("%s-%d", symbol, epoch));
+        List<KeyValue> keyValues = new ArrayList<>();
+
+        keyValues.add(new KeyValue(rowkey, infoColumnFamily, openCol, Bytes.toBytes(open)));
+        keyValues.add(new KeyValue(rowkey, infoColumnFamily, closeCol, Bytes.toBytes(close)));
+        keyValues.add(new KeyValue(rowkey, infoColumnFamily, highCol, Bytes.toBytes(high)));
+        keyValues.add(new KeyValue(rowkey, infoColumnFamily, lowCol, Bytes.toBytes(low)));
+        keyValues.add(new KeyValue(rowkey, infoColumnFamily, adjcloseCol, Bytes.toBytes(adjclose)));
+        keyValues.add(new KeyValue(rowkey, infoColumnFamily, dateCol, Bytes.toBytes(epoch)));
+        keyValues.add(new KeyValue(rowkey, infoColumnFamily, symbolCol, Bytes.toBytes(symbol)));
+
+        return keyValues.stream().map(r -> new Tuple2<>(new ImmutableBytesWritable(rowkey), r)).iterator();
+    }
+
     public static Stock parse(Result result){
         Stock stock = new Stock();
         stock.setOpen(Bytes.toDouble(result.getValue(infoColumnFamily, openCol)));
@@ -148,6 +169,12 @@ public static Stock parse(Result result){

     }

+    public ImmutableBytesWritable toKey(){
+        long epoch = date.getTime();
+        byte[] rowkey = Bytes.toBytes(String.format("%s-%d", symbol, epoch));
+        return new ImmutableBytesWritable(rowkey);
+    }
+

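Note on the Stock helpers: toKey() and toKVPairs() both derive the composite rowkey symbol-epochMillis. toPut() feeds the Put-based paths above, while toKVPairs() emits one KeyValue per column, which is the shape needed for HFile-style bulk loading rather than for the TableOutputFormat path that createHFiles currently uses. A hedged sketch of how toKVPairs() could feed HFileOutputFormat2 follows. None of this is in the commit: it assumes stockRows, configuration and outputPath as in createHFiles, it needs a serializer that can handle the non-Serializable HBase types across the shuffle (e.g. Kryo), and a real bulk load would typically also call HFileOutputFormat2.configureIncrementalLoad so output partitions line up with region boundaries before finishing with the completebulkload tool.

    // Hedged sketch, not part of this commit. Assumes stockRows (Dataset<Stock>),
    // configuration (with hbase-site.xml added) and outputPath as in createHFiles,
    // plus imports for org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2 and
    // org.apache.spark.api.java.function.PairFlatMapFunction.
    JavaPairRDD<ImmutableBytesWritable, KeyValue> kvRdd = stockRows.javaRDD()
            .flatMapToPair((PairFlatMapFunction<Stock, ImmutableBytesWritable, KeyValue>)
                    stock -> stock.toKVPairs())   // one KeyValue per column, keyed by rowkey
            .sortByKey();                         // HFiles must be written in rowkey order

    kvRdd.saveAsNewAPIHadoopFile(outputPath
            , ImmutableBytesWritable.class
            , KeyValue.class
            , HFileOutputFormat2.class
            , configuration);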