Google Cloud Certified Professional Data Engineer

  • Dataflow Lab

  • Create a Dataflow pipeline using Python

Reference — every command used in this lab, with the step it belongs to captioned below it

# Step 1 — environment variables used throughout the lab.
# $RANDOM is captured once so the project ID and bucket name share the
# same numeric suffix (GCS bucket names and project IDs are global, so
# the suffix keeps them unique).
export RANDOMID=$RANDOM
export PROJECT_ID="data-flow-${RANDOMID}"
export EMAIL_ADDRESS=learnpde@gmail.com
export BUCKET_NAME="dataflow-wordcount-example-${RANDOMID}"
export DATAFLOW_REGION=us-central1
# printf is preferred over echo for arbitrary variable content.
printf '%s\n' "$PROJECT_ID"
printf '%s\n' "$BUCKET_NAME"

1 - Set Env Variables

# Step 2 — create a new GCP project for the lab (quoted to satisfy SC2086).
gcloud projects create "$PROJECT_ID"

2 - Create Project

# Step 3 — make the new project the default for subsequent gcloud commands.
gcloud config set project "$PROJECT_ID"

3 - Set Project

# Step 4 — look up the numeric project number (needed below to address the
# Compute Engine default service account). The filter matches the project ID
# exactly instead of as a substring. Assignment is split from 'export' so the
# gcloud exit status is not masked (SC2155).
PROJECT_NUMBER=$(gcloud projects list --filter="projectId=${PROJECT_ID}" --format="value(PROJECT_NUMBER)")
export PROJECT_NUMBER

4 - Set Project Number

# Step 5 — verify the project number was captured (should print digits,
# not an empty line).
printf '%s\n' "$PROJECT_NUMBER"

5 - Verify Project Number

# Step 6 — enable the APIs the lab needs: Dataflow, Compute Engine, Cloud
# Logging, Cloud Storage (component + JSON API), BigQuery, Pub/Sub, Datastore
# and Resource Manager. The short names (e.g. compute_component) are legacy
# service aliases that gcloud accepts; the last two are full endpoint names.
gcloud services enable dataflow compute_component logging storage_component storage_api \
bigquery pubsub datastore.googleapis.com cloudresourcemanager.googleapis.com

6 - Enable Services

# Step 7 — create Application Default Credentials (opens a browser for the
# OAuth flow); the Beam/Dataflow client libraries pick these up automatically.
gcloud auth application-default login

7 - Auth Login

# Step 8 — allow the user account to act as service accounts in the project
# (required to launch Dataflow jobs that run as the default service account).
gcloud projects add-iam-policy-binding "$PROJECT_ID" \
  --member="user:${EMAIL_ADDRESS}" \
  --role=roles/iam.serviceAccountUser

8 - Add IAM Policy Binding

# Step 9 — create a Standard-class bucket in the US multi-region; it will hold
# the pipeline's staging/temp files and its results.
gsutil mb -c STANDARD -l US "gs://${BUCKET_NAME}"

9 - Make a Bucket

# Step 10 — grant the Compute Engine default service account (which Dataflow
# workers run as) the three roles the job needs: job administration, worker
# execution, and read/write access to the bucket's objects. A loop avoids
# repeating the identical binding command three times.
for role in roles/dataflow.admin roles/dataflow.worker roles/storage.objectAdmin; do
  gcloud projects add-iam-policy-binding "$PROJECT_ID" \
    --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
    --role="$role"
done

10 - Add Necessary Roles

# Step 11 — confirm Python and pip are installed and on PATH (Apache Beam
# supports specific Python 3 versions — check the Beam release notes).
python --version
python -m pip --version

11 - Check Python, Pip Version

# Step 12 — create a working directory and a virtual environment inside it.
# -p makes mkdir idempotent on re-runs; the cd is checked (SC2164) so a
# failed cd does not leave us creating the venv in the wrong directory.
mkdir -p "dataflow-${RANDOMID}"
cd "dataflow-${RANDOMID}" || exit 1
python -m venv env

12 - CD to Project Folder

# Step 13 — activate the virtual environment created in the previous step so
# pip installs into it rather than the system Python.
source env/bin/activate

13 - Activate Environment

# Step 14 — install dependencies into the virtualenv. wheel is installed
# first so apache-beam's dependencies can install from prebuilt wheels;
# the [gcp] extra pulls in the Google Cloud (Dataflow) support.
pip install wheel
pip install 'apache-beam[gcp]'

14 - Install Dependencies

# Step 15 — run the Beam wordcount example locally (DirectRunner is the
# default when no --runner is given); results are written to files whose
# names start with "outputs" in the current directory.
python -m apache_beam.examples.wordcount \
  --output outputs

15 - Run Program Locally

# Step 16 — page through the sharded output files from the local run
# (names look like outputs-00000-of-00001); press q to quit.
more outputs*

16 - Check Outputs

# Step 17 — run the same wordcount example on the Dataflow service: reads the
# sample King Lear text, writes results and temp files to the lab bucket.
# All expansions are quoted (SC2086).
python -m apache_beam.examples.wordcount \
  --region "$DATAFLOW_REGION" \
  --input gs://dataflow-samples/shakespeare/kinglear.txt \
  --output "gs://${BUCKET_NAME}/results/outputs" \
  --runner DataflowRunner \
  --project "$PROJECT_ID" \
  --temp_location "gs://${BUCKET_NAME}/tmp/"

17 - Run Program on DataflowRunner

# Step 18 — clean up: delete the lab project (prompts for confirmation),
# which removes the bucket, IAM bindings, and any remaining Dataflow jobs.
gcloud projects delete "$PROJECT_ID"

18 - Delete Project

THANKS

FOR

WATCHING