

05_functions

December 18, 2023

0.1 Migrating from Spark to BigQuery via Dataproc – Part 5


• Part 1: The original Spark code, now running on Dataproc (lift-and-shift).
• Part 2: Replace HDFS by Google Cloud Storage. This enables job-specific clusters. (cloud-native)
• Part 3: Automate everything, so that we can run in a job-specific cluster. (cloud-optimized)
• Part 4: Load CSV into BigQuery, use BigQuery. (modernize)
• Part 5: Using Cloud Functions, launch analysis every time there is a new file in the bucket. (serverless)

0.1.1 Catch-up cell
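A minimal sketch of what a catch-up cell could contain for this part, assuming (as in the earlier parts) that the KDD Cup sample file needs to be downloaded and staged in the bucket, and that the sparktobq.kdd_cup table from Part 4 already exists; the download URL and bucket name below are assumptions:

[ ]: # Sketch of a catch-up cell; the URL and bucket name are assumptions.
!wget -nc http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
!gunzip -f kddcup.data_10_percent.gz
BUCKET = 'qwiklabs-gcp-00-0bb736ec2d40'  # CHANGE to your bucket
!gsutil cp kddcup.data_10_percent gs://$BUCKET/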


0.1.2 Create reporting function
[10]: %%writefile main.py

from google.cloud import bigquery
import google.cloud.storage as gcs
import tempfile
import os

def create_report(BUCKET, gcsfilename, tmpdir):
    """
    Creates report in gs://BUCKET/ based on contents in gcsfilename (gs://bucket/some/dir/filename)
    """
    # connect to BigQuery
    client = bigquery.Client()
    destination_table = client.get_table('sparktobq.kdd_cup')

    # Specify table schema. Autodetect is not a good idea for production code
    job_config = bigquery.LoadJobConfig()
    schema = [
        bigquery.SchemaField("duration", "INT64"),
    ]
    for name in ['protocol_type', 'service', 'flag']:
        schema.append(bigquery.SchemaField(name, "STRING"))
    for name in 'src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins'.split(','):
        schema.append(bigquery.SchemaField(name, "INT64"))
    schema.append(bigquery.SchemaField("unused_10", "STRING"))
    schema.append(bigquery.SchemaField("num_compromised", "INT64"))
    schema.append(bigquery.SchemaField("unused_12", "STRING"))
    for name in 'su_attempted,num_root,num_file_creations'.split(','):
        schema.append(bigquery.SchemaField(name, "INT64"))
    for fieldno in range(16, 41):
        schema.append(bigquery.SchemaField("unused_{}".format(fieldno), "STRING"))
    schema.append(bigquery.SchemaField("label", "STRING"))
    job_config.schema = schema

    # Load CSV data into BigQuery, replacing any rows that were there before
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.skip_leading_rows = 0
    job_config.source_format = bigquery.SourceFormat.CSV
    load_job = client.load_table_from_uri(gcsfilename, destination_table, job_config=job_config)
    print("Starting LOAD job {} for {}".format(load_job.job_id, gcsfilename))
    load_job.result()  # Waits for table load to complete.
    print("Finished LOAD job {}".format(load_job.job_id))

    # connections by protocol
    sql = """
        SELECT COUNT(*) AS count
        FROM sparktobq.kdd_cup
        GROUP BY protocol_type
        ORDER by count ASC
    """
    connections_by_protocol = client.query(sql).to_dataframe()
    connections_by_protocol.to_csv(os.path.join(tmpdir, "connections_by_protocol.csv"))
    print("Finished analyzing connections")

    # attacks plot
    sql = """
        SELECT
            protocol_type,
            CASE label
                WHEN 'normal.' THEN 'no attack'
                ELSE 'attack'
            END AS state,
            COUNT(*) as total_freq,
            ROUND(AVG(src_bytes), 2) as mean_src_bytes,
            ROUND(AVG(dst_bytes), 2) as mean_dst_bytes,
            ROUND(AVG(duration), 2) as mean_duration,
            SUM(num_failed_logins) as total_failed_logins,
            SUM(num_compromised) as total_compromised,
            SUM(num_file_creations) as total_file_creations,
            SUM(su_attempted) as total_root_attempts,
            SUM(num_root) as total_root_acceses
        FROM sparktobq.kdd_cup
        GROUP BY protocol_type, state
        ORDER BY 3 DESC
    """
    attack_stats = client.query(sql).to_dataframe()
    ax = attack_stats.plot.bar(x='protocol_type', subplots=True, figsize=(10, 25))
    ax[0].get_figure().savefig(os.path.join(tmpdir, 'report.png'));
    print("Finished analyzing attacks")

    # write the outputs to gs://BUCKET/sparktobq/, replacing any previous report
    bucket = gcs.Client().get_bucket(BUCKET)
    for blob in bucket.list_blobs(prefix='sparktobq/'):
        blob.delete()
    for fname in ['report.png', 'connections_by_protocol.csv']:
        bucket.blob('sparktobq/{}'.format(fname)).upload_from_filename(os.path.join(tmpdir, fname))
    print("Uploaded report based on {} to {}".format(gcsfilename, BUCKET))

def bigquery_analysis_cf(data, context):
    # Cloud Functions entry point: "data" describes the Cloud Storage object
    # that triggered the google.storage.object.finalize event.
    # check that trigger is for a file of interest
    bucket = data['bucket']
    name = data['name']
    if ('kddcup' in name) and not ('gz' in name):
        filename = 'gs://{}/{}'.format(bucket, data['name'])
        print(bucket, filename)
        with tempfile.TemporaryDirectory() as tmpdir:
            create_report(bucket, filename, tmpdir)
Overwriting main.py

[11]: %%writefile requirements.txt


google-cloud-bigquery
google-cloud-storage
pandas
matplotlib

Overwriting requirements.txt

[12]: # verify that the code in the CF works

name = 'kddcup.data_10_percent'
if 'kddcup' in name and not ('gz' in name):
    print(True)

True

0.2 Test that the function endpoint works


[13]: # test that the function works
import main as bq

BUCKET = 'qwiklabs-gcp-00-0bb736ec2d40'  # CHANGE
try:
    bq.create_report(BUCKET, 'gs://{}/kddcup.data_10_percent'.format(BUCKET), "/tmp")
except Exception as e:
    print(e.errors)

Starting LOAD job a90a8510-a593-45e1-b42a-1ec57c06eb4d for gs://qwiklabs-gcp-00-0bb736ec2d40/kddcup.data_10_percent
Finished LOAD job a90a8510-a593-45e1-b42a-1ec57c06eb4d
Finished analyzing connections
Finished analyzing attacks
Uploaded report based on gs://qwiklabs-gcp-00-0bb736ec2d40/kddcup.data_10_percent to qwiklabs-gcp-00-0bb736ec2d40

0.3 Deploy the cloud function
[18]: !gcloud functions deploy bigquery_analysis_cf --runtime python37 --trigger-resource $BUCKET --trigger-event google.storage.object.finalize

ERROR: (gcloud.functions.deploy) Error building source archive from path [.]. Could not validate source files: [[Errno 2] No such file or directory: './proc/2/exe']. Please ensure that path [.] contains function code or specify another directory with --source
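The error message itself points at the workaround: deploy from a directory that contains only the function code. A minimal sketch of that fix, assuming main.py and requirements.txt are in the notebook's working directory (the cf_source directory name is arbitrary):

[ ]: # Stage only the function code in a dedicated directory and deploy from it.
!mkdir -p cf_source
!cp main.py requirements.txt cf_source/
!gcloud functions deploy bigquery_analysis_cf --source cf_source --runtime python37 --trigger-resource $BUCKET --trigger-event google.storage.object.finalize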

0.4 Try it out


Copy the file to the bucket:

[15]: !gsutil rm -rf gs://$BUCKET/sparktobq


!gsutil cp kddcup.data_10_percent gs://$BUCKET/

Removing gs://qwiklabs-gcp-00-0bb736ec2d40/sparktobq/connections_by_protocol.csv#1702863986507885…
Removing gs://qwiklabs-gcp-00-0bb736ec2d40/sparktobq/report.png#1702863986385588…
/ [2 objects]
Operation completed over 2 objects.
Copying file://kddcup.data_10_percent [Content-Type=application/octet-stream]…
- [1 files][ 71.4 MiB/ 71.4 MiB]
Operation completed over 1 objects/71.4 MiB.
Verify that the Cloud Function is being run. You can do this from the Cloud Functions part of the
GCP Console.
Once the function is complete (in about 30 seconds), see if the output folder contains the report:

[16]: !gsutil ls gs://$BUCKET/sparktobq

CommandException: One or more URLs matched no objects.
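No objects match here because the deployment above did not succeed; once the function deploys, the report files should appear under this prefix. As an alternative to the Console, a hedged way to check the function's executions from the notebook (assuming it is deployed under the name used above):

[ ]: # Read recent log entries for the deployed Cloud Function.
!gcloud functions logs read bigquery_analysis_cf --limit 20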


Copyright 2019 Google Inc. Licensed under the Apache License, Version 2.0 (the “License”); you
may not use this file except in compliance with the License. You may obtain a copy of the License
at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to
in writing, software distributed under the License is distributed on an “AS IS” BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for
the specific language governing permissions and limitations under the License.
