Azure Data Lake Series: Working with JSON - Part 3
/Important to Note: If you are just beginning and trying to figure out how to parse JSON documents with U-SQL and Azure Data Lake Analytics, I highly recommend kicking off with Part 1 and Part 2 in this series.
Prerequisites
- An Azure subscription
- An Azure Data Lake Store account
- An Azure Data Lake Analytics account
- Uploaded and registered custom .NET JSON assemblies (see Part 1)
Exercise - Top Restaurants in London
In this final part, I'm going to illustrate how we can extract data from multiple levels of a hierarchical JSON document across multiple files. In this example, I will be using JSON documents that contain data on popular restaurants across London.
Note:
- The purpose of this article is to focus on the parsing capabilities of U-SQL.
- I will not be providing the Python script used to source the JSON files themselves or provide a copy of the documents. The data source has been chosen for educational purposes only.
- That said, a slimmed down representation of what each document looks like is provided below.
- In reality, each document contains an array of 250 restaurants. The output will be the result of parsing in excess of 20,000 restaurants spread across 87 files.
{
"status": "OK",
"response": {
"pageData": {
"count": 250,
"q_data": [
],
"localityInfo": {
"entity_id": "61",
"entity_type": "city_id"
},
"total_count": 27576,
"appliedFilters": [
{
"default_value": true,
"text": "Rating",
"filterType": "sortbyFilters",
"filterId": "sortby_4",
"get_parameter": {
"sort": "best"
}
}
],
"ads_data": {
"url": "",
"has_ads": false
},
"pageHeading": "Best Restaurants in London",
"push_to_app_experiment_enabled": false,
"reveal_ads_data": {
"should_show": false
},
"header_banner": {
"url": "",
"should_show": false
},
"restaurants": [
{
"cuisines": [
{
"url": "",
"id": "141",
"name": "Steak",
"title": "View all Steak Restaurants in London"
}
],
"basic_info": {
"rating": {
"aria-label": "Rated 4.9",
"votes_text": "Based on 349 votes",
"display": "4.9",
"level": "level-8"
},
"base64": "data:image\/jpeg;base64,\/9j\/4AAQSkZJRgABAQEAYABgAAD\/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT\/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT\/wAARCAAyADIDASIAAhEBAxEB\/8QAHQABAQABBAMAAAAAAAAAAAAAAAcGBAUICQECA\/\/EADEQAAEDAwIDBgMJAAAAAAAAAAEAAgMEBQYHERIhMQgTUWFxkhQVQRgjMkJERWJykf\/EABoBAAIDAQEAAAAAAAAAAAAAAAADAQQGBQf\/xAAjEQABBAAGAwEBAAAAAAAAAAAAAQIDEQQSExRRUgUhMTJB\/9oADAMBAAIRAxEAPwDtTRTKh15xyvjL4qpjh5FG68Y\/JKY21DXOHglarLqxmm\/7RTUWCxas2iVgcJRsvLdWLQ47d8FOo26v2Gm+rozlFgs2rNngjL3Tt2C2Ct7R2LW9\/DNWRsPmVKvai0qhpv8AtFZRR77TmIH9wi9wRGdvJGR3BDsE0qp7fbHCpc5ruH6laRuN0NprpXk78+W5WdZ7dJbdXtp6Zha09dgpnf6yoDHPDHOJ8As\/5FiQyMXlTReNRZGORf4h97zmtHaSIg4A+G6xi46p0tM0kOG\/qpJndNfbjc3PpmScPoVh1Xj2Rz\/jZL\/hUvfh0dmavs6scaZKcXSbVyGeMx8e\/F9N1OtTB81tDqqGR8b+vIrGLJjdwpqhr6psnCD0IK3vL6iSW1OiihdwhvguF5dyuYySJaVFLOHay3Mcnojpra9p2+Ll5fyRaWRlSJHfcv6n8qJCTS8lbJHwd2N001t9zqO9ljBd6LQHR60PaQ+FrgfEKhIvQJIWS\/tLMXHNJF+Fomp0NsDjuaRhP9V6P0Jx5\/6SP2qmokbODqO3k\/YlE3Z6xybrSs9q22t7MmN1cRYaZgB8laUULgsOv1pO8n7HHc9jnEySfhW8\/JFyIRM2sPUVuJewREVorhERABERABERAH\/\/2Q==",
"known_for": "",
"city_id": "61",
"opening_soon_flag": 0,
"photos_tab_urls": {
"food": "",
"ambience": "",
"all": ""
},
"is_open_now": true,
"is_bookmarked": false,
"disclaimer_data": [
],
"subzone_name_verbose": "Soho",
"beenthere_count": 546,
"city": "London",
"thumb": "",
"is_pre_payment_booking_enabled": false,
"timings_display": "12 Noon to 11 PM (Mon-Wed),12 Noon to 11:30 PM (Thu),12 Noon to 12 Midnight (Fri),11:45 AM to 12 Midnight (Sat),12 Noon to 10:30 PM (Sun)",
"serves_veg_flag": 0,
"is_dark_kitchen": false,
"wishlisters_count": 1976,
"event_phone_data": [
],
"res_id": "6114338",
"website": "http:\/\/flatironsteak.co.uk\/",
"openInAppURL": "",
"temp_closed_flag": 0,
"pure_veg_flag": 0,
"restaurant_address": "17 Beak Street, Soho, London W1F 9RW",
"hasOrder": false,
"hasBooking": false,
"subzone_id": "61043",
"phoneData": {
"phone_string": "Not available for this place",
"showWhatsThis": false,
"phone_string_available": 0,
"mobile_string": "",
"mobile_string_display": "",
"isDirect": false,
"has_acd": 1,
"phone_no_label": "Phone number",
"isd_code": "44",
"number_available": true,
"zm-restaurant-table-res-at": "Table reservations at Flat Iron, London",
"resinfo_phone_title": "Flat Iron, Soho phone number",
"isACD": false,
"acd_active_flag": 0
},
"short_link": "",
"is_seating_available": true,
"name": "Flat Iron",
"res_subzone_url": "",
"real_city": "London",
"urls": {
"INFO": "",
"MAPS": "",
"MENU": "",
"TABLE_RESERVATION": "",
"ORDER": "",
"REVIEWS": "",
"PHOTOS": "",
"DAILY_MENU": "",
"EVENTS": ""
},
"subzone_name": "Soho",
"hidden_reviewers_count": 167,
"user_review_count": 351
},
"photos_data": {
"photo_categories": [
{
"category": "all",
"category_label": "All Photos",
"count": 4,
"total_photos": 1587,
"photos": [
{
"restaurant": "6114338",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 0,
"comments_count": 0,
"friendlyTime": "Aug 08, 2016",
"alternative": "Flat Iron, Soho Photos",
"id": "r_NDI4NTE2MTgwNw",
"page_url": ""
},
{
"restaurant": "6114338",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 1,
"comments_count": 0,
"friendlyTime": "Aug 08, 2016",
"alternative": "Flat Iron, Soho Photos",
"id": "r_NzQzMTY3ODUwNz",
"page_url": ""
},
{
"restaurant": "6114338",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 0,
"comments_count": 0,
"friendlyTime": "Aug 08, 2016",
"alternative": "Flat Iron, Soho Photos",
"id": "r_ODIyNDcwNzMzND",
"page_url": ""
},
{
"restaurant": "6114338",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 0,
"comments_count": 0,
"friendlyTime": "Aug 08, 2016",
"alternative": "Flat Iron, Soho Photos",
"id": "r_MTU4MDI4NzA2Nj",
"page_url": ""
}
]
}
],
"authors": [
{
"basic_info": {
"profile_picture": "",
"name": "",
"id": "1"
}
}
]
},
"search_menu_img": {
"url": "",
"thumb_url": ""
},
"book_info": {
"button__title_text_color": "#ffffff",
"mezzoProvider": 0,
"button__title": "Book a Table",
"should_open_external_tab_booking": 0,
"trprovider": "",
"button__background_color": "#3694f2",
"href": "",
"is_available": false,
"button__subtitle_text_color": "#ffffff",
"external_booking_link": "",
"button__subtitle": ""
},
"all_timings": [
{
"bold": false,
"day": "Mon",
"time": "12Noon to 11PM"
},
{
"bold": true,
"day": "Tue",
"time": "12Noon to 11PM"
},
{
"bold": false,
"day": "Wed",
"time": "12Noon to 11PM"
},
{
"bold": false,
"day": "Thu",
"time": "12Noon to 11:30PM"
},
{
"bold": false,
"day": "Fri",
"time": "12Noon to 12Midnight"
},
{
"bold": false,
"day": "Sat",
"time": "11:45AM to 12Midnight"
},
{
"bold": false,
"day": "Sun",
"time": "12Noon to 10:30PM"
}
],
"o2_partner_info": null,
"offers": null,
"collections": [
],
"highlight_dish": "",
"menus_data": {
"image_menus": [
{
"menu_type": "FOOD",
"name": "Food",
"photos": [
{
"url": "",
"id": "127904773",
"uploaded_by": 4294311
}
]
},
{
"menu_type": "BAR",
"name": "Bar",
"photos": [
{
"url": "",
"id": "127904771",
"uploaded_by": 4294311
},
{
"url": "",
"id": "127904775",
"uploaded_by": 4294311
}
]
}
],
"text_menus": [
]
},
"distance_from_loc": "",
"res_id": "6114338",
"events": null,
"ads_meta_data": [
],
"cft_data": {
"breakfast_cft": null,
"cost_beer": {
"text": "\u00a34.50 ",
"heading": "for a pint of beer (approx.)"
},
"title_text": "The cost for two is computed as follows: Average of 2 mid ranged Appetizers + 2 Mains + 2 Beverages + 1 Dessert. The actual cost you incur at a restaurant might change depending on your appetite, or with changes in restaurant menu prices. This assumes no use of the bar facility, except for places in the "Go out for drinks" section.",
"payment_info": "<span itemprop=\"paymentAccepted\">Cash<\/span> and <span itemprop=\"paymentAccepted\" title=\"Cards accepted at Flat Iron\">Cards accepted<\/span>",
"numeric_cft": {
"text": " \u00a335 for two people (approx.)",
"subtext": [
],
"heading": "Average"
},
"non_numeric_cft": null,
"lunch_cft": null
},
"highlight_review": "",
"highlight_tag": "",
"delivery_status": {
"hasOnlineDelivery": 0,
"online_status_code": 2,
"isDeliveringNow": 0
},
"promotions": [
],
"daily_menus_data": {
"daily_menus": null,
"long_menus": null
},
"is_ad_boosted": false
},
{
"cuisines": [
{
"url": "",
"id": "133",
"name": "British",
"title": "View all British Restaurants in London"
}
],
"basic_info": {
"rating": {
"aria-label": "Rated 4.9",
"votes_text": "Based on 765 votes",
"display": "4.9",
"level": "level-8"
},
"base64": "data:image\/jpeg;base64,\/9j\/4AAQSkZJRgABAQEASABIAAD\/2wBDAAQDAwQDAwQEBAQFBQQFBwsHBwYGBw4KCggLEA4RERAOEA8SFBoWEhMYEw8QFh8XGBsbHR0dERYgIh8cIhocHRz\/2wBDAQUFBQcGBw0HBw0cEhASHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBwcHBz\/wAARCAAyADIDAREAAhEBAxEB\/8QAGwABAAIDAQEAAAAAAAAAAAAAAAMEAQIIBQn\/xAApEAABAwEHAwQDAAAAAAAAAAABAAIRAwQFCBNRUpESIUEUGDFCIqHw\/8QAGAEBAQEBAQAAAAAAAAAAAAAAAAIBAwT\/xAAcEQEAAgMBAQEAAAAAAAAAAAAAARECEhMDITH\/2gAMAwEAAhEDEQA\/AO\/pGqBI1QJGqBI1QJGqBI1QJGqDh5mMouHemuM+jrGFrFLGGX\/RZ1XyZfjEDTHQt6M5J6eL8PbPSsn1OSOrjBFM92J1bxQPxktb9f2t6HFr7y27E6J5uLjRAMhea3pjFGa2UexWx9VSM1eszJVBTtbmPgErJhlLNQ5zJLlE5FPLtDi10SVeMkoYdqVVudwv1bc10wVGrrjlCo2qXv7nsqiKJlbZRLvhPxNlWyFn5BZdqtVNoe09MrYwROSSjZ3Wh0lVGKMsl70Caot0g3B1fXmV20THpSRuD2+Wnzws0b0ehRwk3wxkQZUT5nRq\/CVfT5+eEjyOio7B9fLnTB4VxgzdasuEe+aPg8LdU7LXtOvn+CzWW7PoHlM2jhdXJnLZtHCBls2jhAy2bRwgZbNo4QMtm0cIGWzaOEGyAgICAgICD\/\/Z",
"known_for": "",
"city_id": "61",
"opening_soon_flag": 0,
"photos_tab_urls": {
"food": "",
"ambience": "",
"all": ""
},
"is_open_now": true,
"is_bookmarked": false,
"disclaimer_data": [
],
"subzone_name_verbose": "Heron Tower, City of London",
"beenthere_count": 1063,
"city": "London",
"thumb": "",
"is_pre_payment_booking_enabled": false,
"timings_display": "24 Hours (Mon-Sun)",
"serves_veg_flag": 0,
"is_dark_kitchen": false,
"wishlisters_count": 2696,
"event_phone_data": [
],
"res_id": "6114829",
"website": "http:\/\/duckandwaffle.com\/",
"openInAppURL": "",
"temp_closed_flag": 0,
"pure_veg_flag": 0,
"restaurant_address": "Heron Tower, 110 Bishopsgate, City of London, London EC2N 4AY",
"hasOrder": false,
"hasBooking": false,
"subzone_id": "61027",
"phoneData": {
"phone_string": "020 36407310",
"showWhatsThis": false,
"phone_string_available": 1,
"mobile_string": "020 36407310",
"mobile_string_display": "020 36407310",
"isDirect": false,
"has_acd": 1,
"phone_no_label": "Phone number",
"isd_code": "44",
"number_available": true,
"zm-restaurant-table-res-at": "Table reservations at Duck & Waffle, London",
"resinfo_phone_title": "Duck & Waffle, City of London phone number",
"isACD": false,
"acd_active_flag": 0
},
"short_link": "",
"is_seating_available": true,
"name": "Duck & Waffle",
"res_subzone_url": "",
"real_city": "London",
"urls": {
"INFO": "",
"MAPS": "",
"MENU": "",
"TABLE_RESERVATION": "",
"ORDER": "",
"REVIEWS": "",
"PHOTOS": "",
"DAILY_MENU": "",
"EVENTS": ""
},
"subzone_name": "City of London",
"hidden_reviewers_count": 318,
"user_review_count": 735
},
"photos_data": {
"photo_categories": [
{
"category": "all",
"category_label": "All Photos",
"count": 4,
"total_photos": 1084,
"photos": [
{
"restaurant": "6114829",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 11,
"comments_count": 0,
"friendlyTime": "Jun 28, 2016",
"alternative": "Duck & Waffle, City of London Photos",
"id": "r_NzI2MDM3MjgwOT",
"page_url": ""
},
{
"restaurant": "6114829",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 8,
"comments_count": 0,
"friendlyTime": "Jun 28, 2016",
"alternative": "Duck & Waffle, City of London Photos",
"id": "r_MzA0NjU4Mzc5Nz",
"page_url": ""
},
{
"restaurant": "6114829",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 3,
"comments_count": 0,
"friendlyTime": "Jun 28, 2016",
"alternative": "Duck & Waffle, City of London Photos",
"id": "r_Nzc2ODMzNzg5MD",
"page_url": ""
},
{
"restaurant": "6114829",
"url": "",
"is_liked": 0,
"author": "1",
"like_count": 2,
"comments_count": 0,
"friendlyTime": "Jun 28, 2016",
"alternative": "Duck & Waffle, City of London Photos",
"id": "r_zU5Nzk5NDc2NzU",
"page_url": ""
}
]
}
],
"authors": [
{
"basic_info": {
"profile_picture": "",
"name": "",
"id": "1"
}
}
]
},
"search_menu_img": {
"url": "",
"thumb_url": ""
},
"book_info": {
"button__title_text_color": "#ffffff",
"mezzoProvider": 0,
"button__title": "Book a Table",
"should_open_external_tab_booking": 0,
"trprovider": "",
"button__background_color": "#3694f2",
"href": "",
"is_available": false,
"button__subtitle_text_color": "#ffffff",
"external_booking_link": "",
"button__subtitle": ""
},
"all_timings": [
{
"bold": false,
"day": "Mon",
"time": "24 Hours"
},
{
"bold": true,
"day": "Tue",
"time": "24 Hours"
},
{
"bold": false,
"day": "Wed",
"time": "24 Hours"
},
{
"bold": false,
"day": "Thu",
"time": "24 Hours"
},
{
"bold": false,
"day": "Fri",
"time": "24 Hours"
},
{
"bold": false,
"day": "Sat",
"time": "24 Hours"
},
{
"bold": false,
"day": "Sun",
"time": "24 Hours"
}
],
"o2_partner_info": null,
"offers": null,
"collections": [
{
"url": "",
"id": "20",
"name": "Great Breakfasts"
}
],
"highlight_dish": "",
"menus_data": {
"image_menus": [
{
"menu_type": "SPECIAL",
"name": "Special",
"photos": [
{
"url": "",
"id": "131858918",
"uploaded_by": 32032798
}
]
},
{
"menu_type": "FOOD",
"name": "Food",
"photos": [
{
"url": "",
"id": "129795621",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795622",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795623",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795624",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795625",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795627",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795628",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795629",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795630",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795631",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795632",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795633",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795636",
"uploaded_by": 30241803
}
]
},
{
"menu_type": "BAR",
"name": "Bar",
"photos": [
{
"url": "",
"id": "129795626",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795634",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795669",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795670",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795671",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795672",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795673",
"uploaded_by": 30241803
},
{
"url": "",
"id": "129795674",
"uploaded_by": 30241803
}
]
}
],
"text_menus": [
]
},
"distance_from_loc": "",
"res_id": "6114829",
"events": null,
"ads_meta_data": [
],
"cft_data": {
"breakfast_cft": null,
"cost_beer": [
],
"title_text": "The cost for two is computed as follows: Average of 2 mid ranged Appetizers + 2 Mains + 2 Beverages + 1 Dessert. The actual cost you incur at a restaurant might change depending on your appetite, or with changes in restaurant menu prices. This assumes no use of the bar facility, except for places in the "Go out for drinks" section.",
"payment_info": "<span itemprop=\"paymentAccepted\">Cash<\/span> and <span itemprop=\"paymentAccepted\" title=\"Cards accepted at Duck & Waffle\">Cards accepted<\/span>",
"numeric_cft": {
"text": " \u00a355 for two people (approx.)",
"subtext": [
],
"heading": "Average"
},
"non_numeric_cft": null,
"lunch_cft": null
},
"highlight_review": "",
"highlight_tag": "",
"delivery_status": {
"hasOnlineDelivery": 0,
"online_status_code": 2,
"isDeliveringNow": 0
},
"promotions": [
],
"daily_menus_data": {
"daily_menus": null,
"long_menus": null
},
"is_ad_boosted": false
}
]
},
"pageInfo": {
"url": "",
"description": "Best Restaurants in London. Menus, Photos, Ratings and Reviews for Best Restaurants in London",
"type": "COMMON",
"page": "SEARCH",
"title": "Best Restaurants in London"
}
},
"statusCode": 200
}
Goal
Extract key pieces of information (name, address, rating, cost, number of reviews, etc.) and flatten the data into a CSV where each row will represent a single restaurant.
Sample Output
"restaurant_name","address","city","subzone","website","rating","cost","visited","bookmarks","reviews","cuisine"
"10 Greek Street","10 Greek Street, Soho, London W1D 4DH","London","Soho","http://www.10greekstreet.com/","4.1","25","85","189","94","European, British"
"A Cena","418 Richmond Road, Twickenham, London TW1 2EB","London","Twickenham","http://acena.co.uk","4.2","50","39","27","41","Italian"
"A.Wong","70 Wilton Road, Victoria, London SW1V 1DE","London","Victoria","http://www.awong.co.uk","4.1","45","86","203","93","Chinese, Dim Sum"
Challenges
- The data is spread over multiple files.
- Key pieces of information for each restaurant is spread across varying levels in the hierarchy.
- There is some inconsistency with the data (e.g. some fields such as cost are null).
- Each restaurant can have one or many related cuisines (i.e. an array), but we want to keep the output as one row per restaurant.
Challenge 1 - Multiple Files
In Parts 1 and 2 we were only working with a single file and therefore the path was written in its absolute entirety. In this example we want to point U-SQL to a "file set" by specifying a pattern within curly braces. See official documentation for more information.
The snippet below would match a set of files that begin with document_ and end in .json (e.g. document_0, document_1, ..., document_86.json).
// A. CREATE ASSEMBLY: Register assemblies (if they do not already exist).
CREATE ASSEMBLY IF NOT EXISTS [Newtonsoft.Json] FROM @"Newtonsoft.Json.dll";
CREATE ASSEMBLY IF NOT EXISTS [Microsoft.Analytics.Samples.Formats] FROM @"Microsoft.Analytics.Samples.Formats.dll";
// B. REFERENCE ASSEMBLY: Load assemblies for compile time and execution.
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
// C. USING: Specify namespace to shorten function names (e.g. Microsoft.Analytics.Samples.Formats.Json.JsonExtractor)
USING Microsoft.Analytics.Samples.Formats.Json;
// 1. Initialise variables for Input (e.g. JSON) and Output (e.g. CSV).
DECLARE @InputFile string = @"document_{*}.json";
DECLARE @OutputFile string = @"output.csv";
Challenge 2 - Multiple Levels
Fortunately, the JSON assembly includes a MultiLevelJsonExtractor which allows us to extract data from multiple JSON paths at differing levels within a single pass. See the underlying code and inline documentation over at Github.
The code snippet shows the MultiLevelJsonExtractor in action.
- The first parameter (rowpath) specifies the base path to start from.
- The second parameter (bypassWarning) is expecting a boolean value. True = If path isn't found: don't error, return null. False = If path isn't found: error.
- The third parameter (jsonPaths) is a list of JSON paths starting at the base path otherwise the extractor will recurse to the top of the tree to locate it.
// 2. Extract string content from JSON document using MultiLevelJsonExtractor (i.e. schema on read).
@json =
EXTRACT
restaurant_name string,
address string,
city string,
subzone string,
website string,
rating string,
cost string,
visited string,
bookmarks string,
reviews string,
cuisines string
FROM
@InputFile
USING new MultiLevelJsonExtractor("response.pageData.restaurants[*]",
true,
"basic_info.name",
"basic_info.restaurant_address",
"basic_info.city",
"basic_info.subzone_name",
"basic_info.website",
"basic_info.rating.display",
"cft_data.numeric_cft.text",
"basic_info.beenthere_count",
"basic_info.wishlisters_count",
"basic_info.user_review_count",
"cuisines"
);
Challenge 3 - Data Inconsistencies
In this example the cost attribute may not always contain a value (i.e. null), in this case we can leverage the COALESCE operator to essentially a perform the equivalent of: if null x else y.
// 3. Coalesce "cost" (i.e. if null, emtpy string).
@dict =
SELECT
restaurant_name,
address,
city,
subzone,
website,
rating,
cost ?? "" AS cost,
visited,
bookmarks,
reviews,
JsonFunctions.JsonTuple(cuisines).Values AS cuisine_array
FROM @json;
Challenge 4 - Aggregation
Our final challenge is consolidating our output to ensure there is only one row per restaurant. The problem, each restaurant is related to an array of cuisines. The snippet below shows how we can explode the values of the array and then aggregate into a single row using ARRAY_AGG.
// 4. Explode Cuisine Array
@explode =
SELECT
restaurant_name,
address,
city,
subzone,
website,
rating,
cost,
visited,
bookmarks,
reviews,
JsonFunctions.JsonTuple(cuisine)["name"] AS cuisine
FROM @dict
CROSS APPLY
EXPLODE(cuisine_array) AS c(cuisine);
// 5. Clean and Aggregate
@output =
SELECT
restaurant_name,
address,
city,
subzone,
website,
rating,
cost.Replace("\u00A3", "").Replace(" for two people (approx.)", "").Trim() AS cost,
visited,
bookmarks,
reviews,
string.Join(", ", ARRAY_AGG(cuisine)) AS cuisine
FROM @explode
GROUP BY restaurant_name, address, city, subzone, website, rating, cost, visited, bookmarks, reviews;
// 6. Write values to CSV
OUTPUT @output
TO @OutputFile
USING Outputters.Csv(outputHeader:true,quoting:true);
Complete Code
// A. CREATE ASSEMBLY: Register assemblies (if they do not already exist).
CREATE ASSEMBLY IF NOT EXISTS [Newtonsoft.Json] FROM @"Newtonsoft.Json.dll";
CREATE ASSEMBLY IF NOT EXISTS [Microsoft.Analytics.Samples.Formats] FROM @"Microsoft.Analytics.Samples.Formats.dll";
// B. REFERENCE ASSEMBLY: Load assemblies for compile time and execution.
REFERENCE ASSEMBLY [Newtonsoft.Json];
REFERENCE ASSEMBLY [Microsoft.Analytics.Samples.Formats];
// C. USING: Specify namespace to shorten function names (e.g. Microsoft.Analytics.Samples.Formats.Json.JsonExtractor)
USING Microsoft.Analytics.Samples.Formats.Json;
// 1. Initialise variables for Input (e.g. JSON) and Output (e.g. CSV).
DECLARE @InputFile string = @"document_{*}.json";
DECLARE @OutputFile string = @"output.csv";
// 2. Extract string content from JSON document using MultiLevelJsonExtractor (i.e. schema on read).
@json =
EXTRACT
restaurant_name string,
address string,
city string,
subzone string,
website string,
rating string,
cost string,
visited string,
bookmarks string,
reviews string,
cuisines string
FROM
@InputFile
USING new MultiLevelJsonExtractor("response.pageData.restaurants[*]",
true,
"basic_info.name",
"basic_info.restaurant_address",
"basic_info.city",
"basic_info.subzone_name",
"basic_info.website",
"basic_info.rating.display",
"cft_data.numeric_cft.text",
"basic_info.beenthere_count",
"basic_info.wishlisters_count",
"basic_info.user_review_count",
"cuisines"
);
// 3. Coalesce "cost" (i.e. if null, emtpy string).
@dict =
SELECT
restaurant_name,
address,
city,
subzone,
website,
rating,
cost ?? "" AS cost,
visited,
bookmarks,
reviews,
JsonFunctions.JsonTuple(cuisines).Values AS cuisine_array
FROM @json;
// 4. Explode Cuisine Array
@explode =
SELECT
restaurant_name,
address,
city,
subzone,
website,
rating,
cost,
visited,
bookmarks,
reviews,
JsonFunctions.JsonTuple(cuisine)["name"] AS cuisine
FROM @dict
CROSS APPLY
EXPLODE(cuisine_array) AS c(cuisine);
// 5. Clean and Aggregate
@output =
SELECT
restaurant_name,
address,
city,
subzone,
website,
rating,
cost.Replace("\u00A3", "").Replace(" for two people (approx.)", "").Trim() AS cost,
visited,
bookmarks,
reviews,
string.Join(", ", ARRAY_AGG(cuisine)) AS cuisine
FROM @explode
GROUP BY restaurant_name, address, city, subzone, website, rating, cost, visited, bookmarks, reviews;
// 6. Write values to CSV
OUTPUT @output
TO @OutputFile
USING Outputters.Csv(outputHeader:true,quoting:true);
Job Results
Job Graph
Insights
Highest Rated Restaurants in London (sorted by number of reviews)
# | Restaurant | Address | Area | Cost | Cuisine |
---|---|---|---|---|---|
1 | Duck & Waffle | Heron Tower, 110 Bishopsgate, EC2N 4AY | City of London | £55 | British |
2 | Dinner By Heston Blumenthal | Mandarin Oriental Hyde Park, SW1X 7LA | Knightsbridge | £130 | British |
3 | The Ledbury | 127 Ledbury Road, W11 2AQ | Notting Hill | £180 | French, British, European |
4 | Flat Iron | 17 Beak Street, W1F 9RW | Soho | £35 | Steak |
5 | Bao | 53 Lexington Street, W1F 9AS | Soho | £20 | Taiwanese, Street Food |
6 | Spicy Basil | 165 Kilburn High Road, NW6 7HY | Kilburn | £25 | Curry, Thai |
7 | Hoppers | 49 Frith Street, W1D 4SG | Soho | £40 | Indian, Sri Lankan |
8 | Dotori | 3 Stroud Green Road, N4 2DQ | Finsbury Park | £30 | Sushi, Korean, Japanese |
9 | Thai Terrace | 7th Floor, Castle Car Park, Sydenham | Guildford | N/A | Thai |