本文共 5520 字,大约阅读时间需要 18 分钟。
本文将详细介绍 DataX 项目的搭建过程,包括数据展示、环境依赖、工具类实现以及数据处理等内容。
以下是 emp 表和 dept 表的数据结构:
emp 表
| ename | age | job | hiredate | salary | deptno |
|----------|-----|-------|---------|--------|--------|
| 姓名 | 年龄 | 职位 | 入职日期 | 工资 | 部门号 |

dept 表

| deptno | dname | loc |
|--------|-------|--------|
| 部门号 | 部门名称 | 地区 |

建表语句如下:

```sql
create external table if not exists default.dept ( deptno string, dname string, loc string) row format delimited fields terminated by '\t';
create external table if not exists default.emp ( ename string, age int, job string, hiredate date, salary int, deptno string) row format delimited fields terminated by '\t';
```
项目运行所需依赖如下:
- `com.github.binarywang:java-testdata-generator:1.1.2`:用于随机生成测试数据
- `org.apache.commons:commons-csv:1.7`:用于 CSV 数据处理
- `junit:junit:4.12`:用于单元测试

该工具类用于生成模拟数据:
package com.wangt.product.data;

import cn.binarywang.tools.generator.ChineseAddressGenerator;
import cn.binarywang.tools.generator.ChineseNameGenerator;

import java.time.LocalDate;
import java.time.YearMonth;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;
import java.util.concurrent.ThreadLocalRandom;

/**
 * Static helpers that produce random field values for mock employee records:
 * age, job title, hire date and salary.
 */
public class ProductDataUtil {

    /**
     * Candidate job titles keyed by department number. Built once instead of on
     * every {@link #productJob(String)} call.
     */
    private static final Map<String, List<String>> JOBS_BY_DEPT = buildJobsByDept();

    /**
     * Builds the department-number to job-title table.
     *
     * <p>The titles are sample values; each one contains one of the keywords
     * that {@link #productSal(String)} matches on, so every generated job maps
     * to a non-default salary band.
     */
    private static Map<String, List<String>> buildJobsByDept() {
        Map<String, List<String>> source = new TreeMap<>();
        source.put("RSA001_20190121", Arrays.asList("产品总监", "产品经理", "产品专员"));
        source.put("RSA002_20180623", Arrays.asList("设计总监", "设计经理", "设计专员"));
        source.put("RSA003_20150422", Arrays.asList("技术总监", "技术经理", "开发工程师", "测试工程师"));
        source.put("RSA004_20160903", Arrays.asList("运营总监", "运营经理", "运营专员"));
        source.put("RSA005_20120608", Arrays.asList("财务总监", "财务经理", "财务专员"));
        return Collections.unmodifiableMap(source);
    }

    /**
     * Generates a random age.
     *
     * @return an age in the range 20 to 44, inclusive
     */
    public static int productAge() {
        return 20 + ThreadLocalRandom.current().nextInt(25);
    }

    /**
     * Generates a salary for the given job title based on the keyword it contains.
     *
     * <ul>
     *   <li>"总监": 30000, 50000 or 70000</li>
     *   <li>"经理": 22000 or 32000</li>
     *   <li>"工程师": 13000 or 23000</li>
     *   <li>"专员": 15000</li>
     *   <li>anything else: 8000</li>
     * </ul>
     *
     * @param job the job title; must not be {@code null}
     * @return the salary for that title
     * @throws NullPointerException if {@code job} is {@code null}
     */
    public static int productSal(String job) {
        ThreadLocalRandom random = ThreadLocalRandom.current();
        if (job.contains("总监")) {
            return 30000 + random.nextInt(3) * 10000 * 2;
        }
        if (job.contains("经理")) {
            return 22000 + random.nextInt(2) * 10000;
        }
        if (job.contains("工程师")) {
            return 13000 + random.nextInt(2) * 10000;
        }
        if (job.contains("专员")) {
            return 15000;
        }
        // Default salary for titles that match none of the keywords.
        return 8000;
    }

    /**
     * Generates a random hire date between 2014-01-01 and 2018-12-31.
     *
     * <p>The day is bounded by the real length of the chosen month, so the
     * result is always a valid calendar date, and it is rendered zero-padded
     * as {@code yyyy-MM-dd}.
     *
     * @return the hire date in ISO {@code yyyy-MM-dd} form
     */
    public static String productHiredate() {
        ThreadLocalRandom random = ThreadLocalRandom.current();
        int year = 2014 + random.nextInt(5);
        int month = 1 + random.nextInt(12);
        // Limit the day to the month's actual length (28-31) to avoid dates like Feb 31.
        int day = 1 + random.nextInt(YearMonth.of(year, month).lengthOfMonth());
        return LocalDate.of(year, month, day).toString();
    }

    /**
     * Picks a random job title for the given department number.
     *
     * @param depNo the department number, e.g. {@code "RSA001_20190121"}
     * @return one of the job titles registered for that department
     * @throws IllegalArgumentException if {@code depNo} is {@code null} or not a known department
     */
    public static String productJob(String depNo) {
        List<String> jobs = depNo == null ? null : JOBS_BY_DEPT.get(depNo);
        if (jobs == null || jobs.isEmpty()) {
            throw new IllegalArgumentException("Unknown department number: " + depNo);
        }
        int index = ThreadLocalRandom.current().nextInt(jobs.size());
        return jobs.get(index);
    }

    // Other methods are not shown here; see the complete code.
}
package com.wangt.product.data;

import cn.binarywang.tools.generator.CSVFileGenerator;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

/**
 * Generates mock department and employee records and writes them to
 * tab-delimited text files.
 */
public class ProductData {

    private static final String CSV_DELIMITER = "\t";

    /**
     * Generates a single employee record.
     *
     * @return the fields ename, age, job, hiredate, salary and deptno joined by tabs
     */
    public static String productEmp() {
        String ename = ProductDataUtil.productName();
        int age = ProductDataUtil.productAge();
        String deptno = ProductDataUtil.productDeptNo();
        // The job depends on the department, and the salary depends on the job.
        String job = ProductDataUtil.productJob(deptno);
        String hiredate = ProductDataUtil.productHiredate();
        int salary = ProductDataUtil.productSal(job);
        return ename + CSV_DELIMITER + age + CSV_DELIMITER + job + CSV_DELIMITER
                + hiredate + CSV_DELIMITER + salary + CSV_DELIMITER + deptno;
    }

    /**
     * Generates the fixed list of department records.
     *
     * @return one tab-joined "deptno, dname, loc" line per department
     */
    public static List<String> productDept() {
        List<String> records = new ArrayList<>();
        records.add("RSA001_20190121\t产品部\tRSA001");
        records.add("RSA002_20180623\t设计部\tRSA002");
        records.add("RSA003_20150422\t技术部\tRSA003");
        records.add("RSA004_20160903\t运营部\tRSA004");
        records.add("RSA005_20120608\t财务部\tRSA005");
        return records;
    }

    /**
     * Writes a header line followed by {@code numRecords} generated employee records.
     *
     * @param numRecords number of employee records to generate; values below 1 write only the header
     * @param filePath   path of the output file, created or overwritten
     * @throws IOException if the file cannot be opened or written
     */
    public static void saveEmp(int numRecords, String filePath) throws IOException {
        CSVFormat format = tabFormat("ename", "age", "job", "hiredate", "salary", "deptno");
        // Two resources so the writer is closed even if the printer fails while writing the header.
        try (Writer out = Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8);
             CSVPrinter csvPrinter = new CSVPrinter(out, format)) {
            for (int i = 0; i < numRecords; i++) {
                printLine(csvPrinter, productEmp());
            }
        }
    }

    /**
     * Writes a header line followed by all department records.
     *
     * @param filePath path of the output file, created or overwritten
     * @throws IOException if the file cannot be opened or written
     */
    public static void saveDept(String filePath) throws IOException {
        CSVFormat format = tabFormat("deptno", "dname", "loc");
        try (Writer out = Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8);
             CSVPrinter csvPrinter = new CSVPrinter(out, format)) {
            for (String record : productDept()) {
                printLine(csvPrinter, record);
            }
        }
    }

    /**
     * Builds the output format: tab delimiter, LF line endings, quoting disabled,
     * and the given header.
     *
     * <p>NOTE(review): the header is written as the first line of the file; a reader
     * that does not skip header lines will see it as a data row - confirm this is wanted.
     */
    private static CSVFormat tabFormat(String... header) {
        return CSVFormat.DEFAULT
                .withDelimiter(CSV_DELIMITER.charAt(0))
                // DEFAULT ends records with CRLF; use a bare LF so no '\r' trails the last field.
                .withRecordSeparator('\n')
                // Disable quoting so field values are emitted exactly as generated.
                .withQuote((Character) null)
                .withHeader(header);
    }

    /** Splits a tab-joined record into its fields and prints them as one record. */
    private static void printLine(CSVPrinter csvPrinter, String record) throws IOException {
        // Limit -1 keeps trailing empty fields so the column count never shrinks.
        csvPrinter.printRecord((Object[]) record.split(CSV_DELIMITER, -1));
    }

    /**
     * Entry point.
     *
     * @param args {@code <deptFile> <empCount> <empFile>}
     * @throws IOException              if either output file cannot be written
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws NumberFormatException    if {@code empCount} is not an integer
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: ProductData <deptFile> <empCount> <empFile>");
        }
        saveDept(args[0]);
        int empNum = Integer.parseInt(args[1]);
        saveEmp(empNum, args[2]);
    }
}
## 运行说明

1. 项目打包后自带所有依赖项,可直接运行。
2. 使用 Maven 打包:

   ```bash
   mvn clean install
   mvn assembly:jar-with-dependencies
   ```

3. 执行运行脚本:

   ```bash
   java -cp KylinAPI-1.0-SNAPSHOT-jar-with-dependencies.jar com.wangt.product.data.ProductData dept.csv 10000000 emp.csv
   ```

通过以上步骤,项目即可顺利运行并生成所需的数据文件。
转载地址:http://cwgyk.baihongyu.com/