BigQuery

BigQuery is an OLAP engine. It performs better on denormalized tables with nested and repeated data.

τ=dom∣⟨A1:τ[∗∣?],...,An:τ[∗∣?]i⟩∣τ = dom | \langle A_1 : τ [∗|?], . . . , A_n : τ [∗|?]i \rangle |

Dremel - https://storage.googleapis.com/pub-tools-public-publication-data/pdf/36632.pdf

Finite State Machine
Record Assembly
System architecture
Dissecting a record

Query Execution detail

Operation

Code plan

READ

$3:elapsed_time_sec, $4:event_type, $1:away_division_alias, $2:home_division_aliasFROM bigquery-public-data.ncaa_basketball.mbb_pbp_srWHERE and(equal($2, 'D1'), equal($1, 'D1'))

AGGREGATE

$30 := COUNTIF($40)$31 := COUNTIF($41)$32 := COUNTIF($42)$33 := COUNTIF($43)

COMPUTE

$40 := and(equal($4, 'threepointmade'), less($3, 2100))$41 := and(or(equal($4, 'threepointmade'), equal($4, 'threepointmiss')), less($3, 2100))$42 := and(equal($4, 'threepointmade'), greater_or_equal($3, 2100))$43 := and(or(equal($4, 'threepointmade'), equal($4, 'threepointmiss')), greater_or_equal($3, 2100))

WRITE

$30, $31, $32, $33TO __stage00_output

Working query

WITH double_entry_book AS (
    --debits
    SELECT
    array_to_string(inputs.addresses, ",") as address
    , inputs.type
    , -inputs.value as value

    FROM `bigquery-public-data.crypto_bitcoin.inputs` as inputs
    UNION ALL 

    -- credits
    SELECT 
     array_to_string(outputs.addresses, ",") as address
     , outputs.type
     , outputs.value as value

    FROM `bigquery-public-data.crypto_bitcoin.outputs` as outputs
)

SELECT 
address 
, type
, sum(value) as balance
FROM double_entry_book 
GROUP BY 1, 2
ORDER BY balance DESC
LIMIT 100

Importing data from Kafka

Run the imports and batch loads from Kafka into BigQuery through Apache Beam

Arrays

.https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays

.https://google.qwiklabs.com/focuses/3638 - Troubleshooting Join pitfalls

WITH races AS (
 SELECT "800M" AS race,
   [STRUCT("Rudisha" as name, [23.4, 26.3, 26.4, 26.1] as splits),
    STRUCT("Makhloufi" as name, [24.5, 25.4, 26.6, 26.1] as splits),
    STRUCT("Murphy" as name, [23.9, 26.0, 27.0, 26.0] as splits),
    STRUCT("Bosse" as name, [23.6, 26.2, 26.5, 27.1] as splits),
    STRUCT("Rotich" as name, [24.7, 25.6, 26.9, 26.4] as splits),
    STRUCT("Lewandowski" as name, [25.0, 25.7, 26.3, 27.2] as splits),
    STRUCT("Kipketer" as name, [23.2, 26.1, 27.3, 29.4] as splits),
    STRUCT("Berian" as name, [23.7, 26.1, 27.0, 29.3] as splits)]
    AS participants)
SELECT
race,
(SELECT name
 FROM UNNEST(participants),
   UNNEST(splits) AS duration
 ORDER BY duration ASC LIMIT 1) AS runner_with_fastest_lap
FROM races;

+------+-------------------------+
| race | runner_with_fastest_lap |
+------+-------------------------+
| 800M | Kipketer                |
+------+-------------------------+

Partition

#standardSQL
 CREATE OR REPLACE TABLE ecommerce.days_with_rain
 PARTITION BY date
 OPTIONS (
   partition_expiration_days=60,
   description="weather stations with precipitation, partitioned by day"
 ) AS


 SELECT
   DATE(CAST(year AS INT64), CAST(mo AS INT64), CAST(da AS INT64)) AS date,
   (SELECT ANY_VALUE(name) FROM `bigquery-public-data.noaa_gsod.stations` AS stations
    WHERE stations.usaf = stn) AS station_name,  -- Stations may have multiple names
   prcp
 FROM `bigquery-public-data.noaa_gsod.gsod*` AS weather
 WHERE prcp < 99.9  -- Filter unknown values
   AND prcp > 0      -- Filter
   AND CAST(_TABLE_SUFFIX AS int64) >= 2018

BQML

.https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-predict

GIS

.https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions

External data sources

.https://cloud.google.com/bigquery/external-data-sources

Can query data directly from

  • Cloud Bigtable

  • Cloud Storage

  • Google Drive

  • Cloud SQL

Supported formats Avro, CSV, JSON (newline delimited), ORC, Parquet

Location of dataset

.https://cloud.google.com/bigquery/docs/locations

Query execution patterns

Statistics

.https://cloud.google.com/bigquery/docs/reference/standard-sql/statistical_aggregate_functions

Metrics

.https://console.cloud.google.com/apis/api/bigquery.googleapis.com/metrics?project=

Labs

name

quest link

BigQuery for data warehousing

More

Long running Queries

.https://cloud.google.com/service-infrastructure/docs/polling-operations

Structs

.https://cloud.google.com/spanner/docs/structs

void FieldAccessOnStructParameters(google::cloud::spanner::Client client) {
  namespace spanner = ::google::cloud::spanner;

  // Cloud Spanner STRUCT<> with named fields is represented as
  // tuple<pair<string, T>...>. Create a type alias for this example:
  using SingerName = std::tuple<std::pair<std::string, std::string>,
                                std::pair<std::string, std::string>>;
  SingerName name({"FirstName", "Elena"}, {"LastName", "Campbell"});

  auto rows = client.ExecuteQuery(spanner::SqlStatement(
      "SELECT SingerId FROM Singers WHERE FirstName = @name.FirstName",
      {{"name", spanner::Value(name)}}));

  for (auto const& row : spanner::StreamOf<std::tuple<std::int64_t>>(rows)) {
    if (!row) throw std::runtime_error(row.status().message());
    std::cout << "SingerId: " << std::get<0>(*row) << "\n";
  }
  std::cout << "Query completed for"
            << " [spanner_field_access_on_struct_parameters]\n";
}

WHERE vs HAVING

SELECT
  ORIGIN,
  AVG(DEP_DELAY) AS dep_delay,
  AVG(ARR_DELAY) AS arr_delay,
  COUNT(ARR_DELAY) AS num_flights
FROM
  `flights.tzcorr`
GROUP BY
  ORIGIN
HAVING
  num_flights > 3650
ORDER BY
  dep_delay DESC

Last updated

Was this helpful?