Google Cloud Certified Professional Data Engineer
-
Dataflow Lab
-
Create a Dataflow pipeline using Python
Reference
# 1 - Set Env Variables
# RANDOMID makes the project ID and bucket name globally unique.
export RANDOMID=$RANDOM
export PROJECT_ID=data-flow-$RANDOMID
export EMAIL_ADDRESS=learnpde@gmail.com
export BUCKET_NAME=dataflow-wordcount-example-$RANDOMID
export DATAFLOW_REGION=us-central1
echo "$PROJECT_ID"
echo "$BUCKET_NAME"

# 2 - Create Project
gcloud projects create "$PROJECT_ID"

# 3 - Set Project (make it the active gcloud configuration project)
gcloud config set project "$PROJECT_ID"

# 4 - Set Project Number (needed to address the Compute Engine default service account)
export PROJECT_NUMBER=$(gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)")

# 5 - Verify Project Number
echo "$PROJECT_NUMBER"
# 6 - Enable Services required by the Dataflow quickstart
# (compute_component / storage_component / storage_api are legacy service aliases
# accepted by gcloud services enable.)
gcloud services enable dataflow compute_component logging storage_component storage_api \
  bigquery pubsub datastore.googleapis.com cloudresourcemanager.googleapis.com

# 7 - Auth Login (create Application Default Credentials for the Beam SDK)
gcloud auth application-default login

# 8 - Add IAM Policy Binding: let the user account act as service accounts
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
  --member="user:${EMAIL_ADDRESS}" --role=roles/iam.serviceAccountUser

# 9 - Make a Bucket for pipeline staging, temp, and output files
gsutil mb -c STANDARD -l US "gs://$BUCKET_NAME"
# 10 - Add Necessary Roles to the Compute Engine default service account,
# which Dataflow workers run as by default.
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
  --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
  --role=roles/dataflow.admin
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
  --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
  --role=roles/dataflow.worker
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
  --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
  --role=roles/storage.objectAdmin
# 11 - Check Python, Pip Version
python --version
python -m pip --version

# 12 - Create the project folder, CD into it, and create a virtualenv
mkdir dataflow-$RANDOMID
cd dataflow-$RANDOMID || exit 1
python -m venv env

# 13 - Activate Environment
source env/bin/activate

# 14 - Install Dependencies (quotes keep the shell from globbing the [gcp] extra)
pip install wheel
pip install 'apache-beam[gcp]'
# 15 - Run Program Locally (DirectRunner; input defaults to the sample kinglear.txt)
python -m apache_beam.examples.wordcount \
  --output outputs

# 16 - Check Outputs (the runner writes sharded files named outputs-0000x-of-0000y)
more outputs*
# 17 - Run Program on DataflowRunner (managed execution on Google Cloud)
python -m apache_beam.examples.wordcount \
  --region "$DATAFLOW_REGION" \
  --input gs://dataflow-samples/shakespeare/kinglear.txt \
  --output "gs://${BUCKET_NAME}/results/outputs" \
  --runner DataflowRunner \
  --project "$PROJECT_ID" \
  --temp_location "gs://${BUCKET_NAME}/tmp/"

# 18 - Delete Project (cleans up all lab resources, including the bucket)
gcloud projects delete "$PROJECT_ID"
THANKS FOR WATCHING
Google Cloud Data Engineer - Dataflow Lab 1
By Deepak Dubey
Google Cloud Data Engineer - Dataflow Lab 1
- 126