https://tool.lu/json/
x// 格式化可以合并整行显示{"name":"John Doe","age":35,"email":"john@example.com"}
// 格式化可以展开显示,示例数据更清晰{ "name": "John Doe", "age": 35, "email": "john@example.com"}工具提供了各种跨格式转化
xxxxxxxxxxage35emailjohn@example.comname'John Doe'根据json串自动生成生成 Java 类等功能也非常实用
需要Hive环境
xxxxxxxxxx-- 在Hive中建测试表CREATE TABLE json_test_table ( id INT, json_column STRING) STORED AS TEXTFILE;json_column 字段用来存储需要处理的 json 串
假设 JSON 数据包含用户信息
xxxxxxxxxx{ "name": "John Doe", "age": 35, "email": "john@example.com"}xxxxxxxxxxinsert into json_test_table values (1,'{"name":"John Doe","age":35,"email":"john@example.com"}');
select * from json_test_table where id = 1;+------+---------------------------------------------------------+| id | json_column |+------+---------------------------------------------------------+| 1 | {"name":"John Doe","age":35,"email":"john@example.com"} |+------+---------------------------------------------------------+使用 get_json_object 函数从 JSON 中提取字段:
xxxxxxxxxxSELECT get_json_object(json_column, '$.name') AS name FROM json_test_table where id = 1;
+-----------+| name |+-----------+| John Doe |+-----------+1 row selected (19.655 seconds)
SELECT get_json_object(json_column, '$.name') AS name,get_json_object(json_column, '$.age') AS age FROM json_test_table where id = 1;+-----------+------+| name | age |+-----------+------+| John Doe | 35 |+-----------+------+1 row selected (19.396 seconds)
在 JSON 数据中有嵌套字段的情况下:
xxxxxxxxxx{ "user": { "name": "Alice", "address": { "city": "New York", "zipcode": "10001" } }}xxxxxxxxxxinsert into json_test_table values (2,'{"user":{"name":"Alice","address":{"city":"New York","zipcode":"10001"}}}');
select * from json_test_table where id = 2;+------+---------------------------------------------------------------------------+| id | json_column |+------+---------------------------------------------------------------------------+| 2 | {"user":{"name":"Alice","address":{"city":"New York","zipcode":"10001"}}} |+------+---------------------------------------------------------------------------+提取嵌套字段的值:
xxxxxxxxxxSELECT get_json_object(json_column, '$.user.name') AS user_name, get_json_object(json_column, '$.user.address.city') AS cityFROM json_test_table where id = 2;
+------------+-----------+| user_name | city |+------------+-----------+| Alice | New York |+------------+-----------+1 row selected (19.38 seconds)
假设 JSON 数据包含一个数组:
xxxxxxxxxx{ "tags": ["apple", "banana", "orange"]}xxxxxxxxxxinsert into json_test_table values (3,'{"tags":["apple","banana","orange"]}');
select * from json_test_table where id = 3;+------+--------------------------------------+| id | json_column |+------+--------------------------------------+| 3 | {"tags":["apple","banana","orange"]} |+------+--------------------------------------+从数组中提取值:
xxxxxxxxxxSELECT get_json_object(json_column, '$.tags[0]') AS first_tag, get_json_object(json_column, '$.tags[1]') AS second_tagFROM json_test_table where id = 3;
+------------+-------------+| first_tag | second_tag |+------------+-------------+| apple | banana |+------------+-------------+1 row selected (18.488 seconds)
获取数组的长度:
xxxxxxxxxx-- 复用 示例3中样例数据select * from json_test_table where id = 3;+------+--------------------------------------+| id | json_column |+------+--------------------------------------+| 3 | {"tags":["apple","banana","orange"]} |+------+--------------------------------------+-- 可以看到数组tags对应的值有3个
SELECT size(split(get_json_object(json_column, '$.tags'), ',')) AS tags_lengthFROM json_test_tableWHERE id = 3;
+--------------+| tags_length |+--------------+| 3 |+--------------+1 row selected (35.766 seconds)
xxxxxxxxxx{ "company_name": "wangting_company", "registration": { "registered_capital": 1000000, "registered_date": "2020-01-01", "registered_address": "123 Main St, City" }}
{"company_name":"wangting_company","registration":{"registered_capital":1000000,"registered_date":"2020-01-01","registered_address":"123 Main St, City"}}xxxxxxxxxx-- 创建表CREATE TABLE company_info ( company_name STRING, registered_capital INT, registered_date STRING, registered_address STRING) STORED AS TEXTFILE;
-- 转化插入数据INSERT INTO company_infoSELECT get_json_object(json_data, '$.company_name') AS company_name, get_json_object(json_data, '$.registration.registered_capital') AS registered_capital, get_json_object(json_data, '$.registration.registered_date') AS registered_date, get_json_object(json_data, '$.registration.registered_address') AS registered_addressFROM (SELECT '{"company_name": "wangting_company", "registration": {"registered_capital": 1000000, "registered_date": "2020-01-01", "registered_address": "123 Main St, City"}}' AS json_data) t; +----------------------------+----------------------------------+-------------------------------+----------------------------------+| company_info.company_name | company_info.registered_capital | company_info.registered_date | company_info.registered_address |+----------------------------+----------------------------------+-------------------------------+----------------------------------+| wangting_company | 1000000 | 2020-01-01 | 123 Main St, City |+----------------------------+----------------------------------+-------------------------------+----------------------------------+1 row selected (0.411 seconds)
使用 LATERAL VIEW 和 explode 解析 JSON 数组:
xxxxxxxxxx-- 复用示例3样例数据SELECT id, tagFROM json_test_tableLATERAL VIEW explode(split(get_json_object(json_column, '$.tags'), ',')) exploded_tags AS tagWHERE id = 3;
+-----+------------+| id | tag |+-----+------------+| 3 | ["apple" || 3 | "banana" || 3 | "orange"] |+-----+------------+3 rows selected (17.318 seconds)
-- 仅展示解析数据,实际使用如需去除方括号,可以一并清洗
如果数组包含对象,则提取对象的值:
xxxxxxxxxx{ "users": [ { "name": "Alice", "age": 28 }, { "name": "Bob", "age": 35 } ]}
{"users":[{"name":"Alice","age":28},{"name":"Bob","age":35}]}xxxxxxxxxxinsert into json_test_table values (7,'{"users":[{"name":"Alice","age":28},{"name":"Bob","age":35}]}');
select * from json_test_table where id = 7;+------+---------------------------------------------------------------+| id | json_column |+------+---------------------------------------------------------------+| 7 | {"users":[{"name":"Alice","age":28},{"name":"Bob","age":35}]} |+------+---------------------------------------------------------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.users[0].name') AS user1_name, get_json_object(json_column, '$.users[1].name') AS user2_nameFROM json_test_table where id = 7;
+-------------+-------------+| user1_name | user2_name |+-------------+-------------+| Alice | Bob |+-------------+-------------+1 row selected (17.372 seconds)
处理 JSON 中的日期字段:
xxxxxxxxxx{ "event_date": "2023-11-21"}
{"event_date":"2023-11-21"}xxxxxxxxxxinsert into json_test_table values (8,'{"event_date":"2023-11-21"}');
select * from json_test_table where id = 8;+------+-----------------------------+| id | json_column |+------+-----------------------------+| 8 | {"event_date":"2023-11-21"} |+------+-----------------------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.event_date') AS event_dateFROM json_test_table where id = 8;
+-------------+| event_date |+-------------+| 2023-11-21 |+-------------+1 row selected (17.436 seconds)
SELECT id, CAST(get_json_object(json_column, '$.event_date') AS DATE) AS event_date, YEAR(CAST(get_json_object(json_column, '$.event_date') AS DATE)) AS event_year, MONTH(CAST(get_json_object(json_column, '$.event_date') AS DATE)) AS event_monthFROM json_test_tableWHERE id = 8;
+-----+-------------+-------------+--------------+| id | event_date | event_year | event_month |+-----+-------------+-------------+--------------+| 8 | 2023-11-21 | 2023 | 11 |+-----+-------------+-------------+--------------+1 row selected (17.363 seconds)
提取 JSON 中的布尔字段:
xxxxxxxxxx{ "is_active": true}
{"is_active":true}xxxxxxxxxxinsert into json_test_table values (9,'{"is_active":true}');
select * from json_test_table where id = 9;+------+--------------------+| id | json_column |+------+--------------------+| 9 | {"is_active":true} |+------+--------------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.is_active') AS is_activeFROM json_test_table where id = 9;+------------+| is_active |+------------+| true |+------------+1 row selected (18.401 seconds)
-- 使用WHERE子句根据布尔值进行条件过滤。SELECT *FROM json_test_tableWHERE get_json_object(json_column, '$.is_active') = 'true';
+---------------------+------------------------------+| json_test_table.id | json_test_table.json_column |+---------------------+------------------------------+| 9 | {"is_active":true} |+---------------------+------------------------------+1 row selected (17.327 seconds)
计算 JSON 对象中包含的键值对数量:
xxxxxxxxxxSELECT json_tuple(json_column, 'name', 'age') AS (name, age)FROM json_test_table;
+-----------+-------+| name | age |+-----------+-------+| John Doe | 35 || NULL | NULL || NULL | NULL || NULL | NULL || NULL | NULL || NULL | NULL |+-----------+-------+6 rows selected (17.344 seconds)
-- 只有id=1符合条件 不符合条件均为NULL
json_tuple 提取值从 JSON 对象中使用 json_tuple 提取多个字段:
xxxxxxxxxx{ "name": "John", "age": 30, "address": "New York"}
{"name":"John","age":30,"address":"New York"}xxxxxxxxxxinsert into json_test_table values (11,'{"name":"John","age":30,"address":"New York"}');
select * from json_test_table where id = 11;+------+-----------------------------------------------+| id | json_column |+------+-----------------------------------------------+| 11 | {"name":"John","age":30,"address":"New York"} |+------+-----------------------------------------------+
xxxxxxxxxxSELECT json_tuple(json_column, 'name', 'age', 'address') AS (name, age, address)FROM json_test_table where id = 11;
+-------+------+-----------+| name | age | address |+-------+------+-----------+| John | 30 | New York |+-------+------+-----------+1 row selected (17.414 seconds)
从 JSON 中提取数字字段:
xxxxxxxxxx{ "count": 50}
{"count":50}xxxxxxxxxxinsert into json_test_table values (12,'{"count":50}');
select * from json_test_table where id = 12;+------+--------------+| id | json_column |+------+--------------+| 12 | {"count":50} |+------+--------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.count') AS countFROM json_test_table where id = 12;
+--------+| count |+--------+| 50 |+--------+1 row selected (19.401 seconds)
get_json_object 过滤数据根据 JSON 中的字段值进行过滤:
xxxxxxxxxxSELECT *FROM json_test_tableWHERE get_json_object(json_column, '$.age') > 30;
+---------------------+----------------------------------------------------+| json_test_table.id | json_test_table.json_column |+---------------------+----------------------------------------------------+| 1 | {"name":"John Doe","age":35,"email":"john@example.com"} |+---------------------+----------------------------------------------------+1 row selected (18.402 seconds)
SELECT *FROM json_test_tableWHERE get_json_object(json_column, '$.age') > 10;
+---------------------+----------------------------------------------------+| json_test_table.id | json_test_table.json_column |+---------------------+----------------------------------------------------+| 1 | {"name":"John Doe","age":35,"email":"john@example.com"} || 11 | {"name":"John","age":30,"address":"New York"} |+---------------------+----------------------------------------------------+2 rows selected (17.298 seconds)
处理 JSON 数据中可能的空值:
xxxxxxxxxx{ "status": null}
{"status":null}xxxxxxxxxxinsert into json_test_table values (14,'{"status":null}');
select * from json_test_table where id = 14;+------+-----------------+| id | json_column |+------+-----------------+| 14 | {"status":null} |+------+-----------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.status') AS statusFROM json_test_table where id = 14;
+---------+| status |+---------+| NULL |+---------+1 row selected (17.345 seconds)
-- 使用IS NULL或IS NOT NULL来检查字段是否为空SELECT id, json_columnFROM json_test_tableWHERE id = 14 AND get_json_object(json_column, '$.status') IS NULL;
+-----+------------------+| id | json_column |+-----+------------------+| 14 | {"status":null} |+-----+------------------+1 row selected (17.47 seconds)
-- 使用CASE语句对NULL值进行处理或者转换为其他值。SELECT id, CASE WHEN get_json_object(json_column, '$.status') IS NULL THEN 'No Status' ELSE get_json_object(json_column, '$.status') END AS statusFROM json_test_tableWHERE id = 14;
+-----+------------+| id | status |+-----+------------+| 14 | No Status |+-----+------------+1 row selected (17.31 seconds)
get_json_object 判断是否存在字段判断 JSON 是否包含特定字段:
xxxxxxxxxxSELECT CASE WHEN get_json_object(json_column, '$.name') IS NOT NULL THEN 'Exists' ELSE 'Not Exists' END AS field_statusFROM json_test_table;+---------------+| field_status |+---------------+| Exists || Not Exists || Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists |+---------------+
SELECT CASE WHEN get_json_object(json_column, '$.age') IS NOT NULL THEN 'Exists' ELSE 'Not Exists' END AS field_statusFROM json_test_table;
+---------------+| field_status |+---------------+| Exists || Not Exists || Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists |+---------------+
SELECT CASE WHEN get_json_object(json_column, '$.aaaaaaaaa') IS NOT NULL THEN 'Exists' ELSE 'Not Exists' END AS field_statusFROM json_test_table;
+---------------+| field_status |+---------------+| Not Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists || Not Exists |+---------------+9 rows selected (17.337 seconds)
从多层嵌套的 JSON 对象中提取值:
xxxxxxxxxx{ "user": { "details": { "name": "Alice", "age": 30 } }}
{"user":{"details":{"name":"Alice","age":30}}}xxxxxxxxxxinsert into json_test_table values (16,'{"user":{"details":{"name":"Alice","age":30}}}');
select * from json_test_table where id = 16;+------+------------------------------------------------+| id | json_column |+------+------------------------------------------------+| 16 | {"user":{"details":{"name":"Alice","age":30}}} |+------+------------------------------------------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.user.details.name') AS user_name, get_json_object(json_column, '$.user.details.age') AS user_ageFROM json_test_table where id = 16;
+------------+-----------+| user_name | user_age |+------------+-----------+| Alice | 30 |+------------+-----------+1 row selected (16.253 seconds)
从嵌套的对象数组中提取值:
xxxxxxxxxx{ "employees": [ { "name": "Alice", "department": "HR" }, { "name": "Bob", "department": "Engineering" } ]}
{"employees":[{"name":"Alice","department":"HR"},{"name":"Bob","department":"Engineering"}]}xxxxxxxxxxinsert into json_test_table values (17,'{"employees":[{"name":"Alice","department":"HR"},{"name":"Bob","department":"Engineering"}]}');
select * from json_test_table where id = 17;+------+----------------------------------------------------------------------------------------------+| id | json_column |+------+----------------------------------------------------------------------------------------------+| 17 | {"employees":[{"name":"Alice","department":"HR"},{"name":"Bob","department":"Engineering"}]} |+------+----------------------------------------------------------------------------------------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.employees[0].name') AS employee1_name, get_json_object(json_column, '$.employees[1].name') AS employee2_nameFROM json_test_table where id = 17;
+-----------------+-----------------+| employee1_name | employee2_name |+-----------------+-----------------+| Alice | Bob |+-----------------+-----------------+1 row selected (17.273 seconds)
取数组对象的多个值
从数组对象中提取多个值:
xxxxxxxxxx{ "items": [ { "id": 1, "name": "Item 1" }, { "id": 2, "name": "Item 2" } ]}
{"items":[{"id":1,"name":"Item 1"},{"id":2,"name":"Item 2"}]}xxxxxxxxxxinsert into json_test_table values (18,'{"items":[{"id":1,"name":"Item 1"},{"id":2,"name":"Item 2"}]}');
select * from json_test_table where id = 18;+------+---------------------------------------------------------------+| id | json_column |+------+---------------------------------------------------------------+| 18 | {"items":[{"id":1,"name":"Item 1"},{"id":2,"name":"Item 2"}]} |+------+---------------------------------------------------------------+
xxxxxxxxxxSELECT get_json_object(json_column, '$.items[0].id') AS item1_id, get_json_object(json_column, '$.items[0].name') AS item1_name, get_json_object(json_column, '$.items[1].id') AS item2_id, get_json_object(json_column, '$.items[1].name') AS item2_nameFROM json_test_table where id = 18;
+-----------+-------------+-----------+-------------+| item1_id | item1_name | item2_id | item2_name |+-----------+-------------+-----------+-------------+| 1 | Item 1 | 2 | Item 2 |+-----------+-------------+-----------+-------------+1 row selected (17.386 seconds)
json_serde库提供了一种在Hive中直接将JSON格式的数据解析成表的方式,可简化处理流程。如果现在有一个json文件:
[app@ali-wangting wangt]$ cat wangt.json
xxxxxxxxxx{"name":"wangting01","age":21,"email":"wangting01@example.com"}{"name":"wangting02","age":22,"email":"wangting02@example.com"}{"name":"wangting03","age":23,"email":"wangting03@example.com"}{"name":"wangting04","age":24,"email":"wangting04@example.com"}{"name":"wangting05","age":25,"email":"wangting05@example.com"}{"name":"wangting06","age":26,"email":"wangting06@example.com"}{"name":"wangting07","age":27,"email":"wangting07@example.com"}{"name":"wangting08","age":28,"email":"wangting08@example.com"}现在希望建立一张表,使得表可以直接映射上数据文件
xxxxxxxxxxCREATE TABLE IF NOT EXISTS wangt_666_json( name string, age int, email string)ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'stored as textfile;
desc wangt_json_table;+-----------+------------+--------------------+| col_name | data_type | comment |+-----------+------------+--------------------+| name | string | from deserializer || age | int | from deserializer || email | string | from deserializer |+-----------+------------+--------------------+xxxxxxxxxx[app@ali-wangting wangt]$ hdfs dfs -ls /user/hive/warehouse/wangt.db/wangt_666_json/[app@ali-wangting wangt]$ hdfs dfs -put wangt.json /user/hive/warehouse/wangt.db/wangt_666_json/[app@ali-wangting wangt]$ hdfs dfs -ls /user/hive/warehouse/wangt.db/wangt_666_json/Found 1 items-rwxrwx--x+ 2 hive hive 512 2023-12-04 16:38 /user/hive/warehouse/wangt.db/wangt_666_json/wangt.json
# 也可以通过hive的sql命令行直接load数据文件load data local inpath '/home/wangt/wangt.json' into table wangt_666_json;xxxxxxxxxxselect * from wangt_666_json;+----------------------+---------------------+-------------------------+| wangt_666_json.name | wangt_666_json.age | wangt_666_json.email |+----------------------+---------------------+-------------------------+| wangting01 | 21 | wangting01@example.com || wangting02 | 22 | wangting02@example.com || wangting03 | 23 | wangting03@example.com || wangting04 | 24 | wangting04@example.com || wangting05 | 25 | wangting05@example.com || wangting06 | 26 | wangting06@example.com || wangting07 | 27 | wangting07@example.com || wangting08 | 28 | wangting08@example.com |+----------------------+---------------------+-------------------------+8 rows selected (0.5 seconds)可以看到数据文件8行记录,成功映射对应了hive中wangt_666_json表的8条数据