GA4データをBigQueryに入れて、そこからSQLやPythonで分析していく内容を共有します。
1. 管理画面からBigQueryの画面へ移行
2. GA4のサンプルデータセットを追加
3. SQLで集計する
✔️ 購入イベントの一覧を見る
select *
from `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20210131`
where event_name = "purchase"
✔️ UNNESTで分解して確認する
select params
from `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20210131`, unnest(event_params) as params
✔️ 日別でユニークユーザー数を確認
select
event_date,
count(distinct user_pseudo_id) as UU
from `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*`
group by event_date
order by event_date
✔️ Session IDとページロケーションの一覧を取得
select
event_date,
(select value.int_value from unnest(event_params) where key = 'ga_session_id'),
(select value.string_value from unnest(event_params) where key = 'page_location')
from `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*`
✔️ 日別のSession数
with session_id_tb as (
select
event_date,
user_pseudo_id,
(select value.int_value from unnest(event_params) where key = "ga_session_id") as ga_session_id
from
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*`
)
select
event_date,
count(distinct concat(user_pseudo_id, ga_session_id)) session_num
from
session_id_tb
group by event_date
order by event_date
✔️ LTVを確認する
select
user_pseudo_id,
sum(user_ltv.revenue) as sum_amt
from
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20210131`
group by user_pseudo_id
order by sum_amt desc
デバイス別の訪問数を確認
select
device.category,
count(*) cnt,
count(*) / sum(count(*)) over() as share
from
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20210131`
group by device.category
order by cnt desc
国別の訪問数を確認
select
geo.country,
count(*) cnt,
count(*) / sum(count(*)) over() as share
from
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20210131`
group by geo.country
order by cnt desc
トラフィックソース(チャンネル)別の訪問数を確認
select
traffic_source.medium,
count(*) cnt,
count(*) / sum(count(*)) over() as share
from
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20210131`
group by traffic_source.medium
order by cnt desc
ユーザーID毎のデータを取得
with user_tb as(
select
event_date,
event_name,
user_pseudo_id,
(select value.int_value from unnest(event_params) where key = "ga_session_id") as ga_session_id
from
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*`
)
select
user_pseudo_id,
count(distinct concat(user_pseudo_id, ga_session_id)) as session_num,
countif(event_name = "page_view") as pv_num,
countif(event_name = "purchase") as purchase_num,
from
user_tb
group by user_pseudo_id
order by session_num desc
自分のプロジェクトにデータセットを移行する
💡
地域は東京にした方が良いです。
create or replace table 'test-420302.test2.access_log_ga4' as
with user_tb as(
select
event_date,
event_name,
user_pseudo_id,
(select value.int_value from unnest(event_params) where key = "ga_session_id") as ga_session_id
from
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*`
)
select
user_pseudo_id,
count(distinct concat(user_pseudo_id, ga_session_id)) as session_num,
countif(event_name = "page_view") as pv_num,
countif(event_name = "purchase") as purchase_num,
from
user_tb
group by user_pseudo_id
order by session_num desc
Pythonで分析する
Google ColabからBigQueryデータを読み込む
from google.colab import auth
auth.authenticate_user()
ライブラリをインポート
from google.cloud import bigquery
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
プロジェクトを読み込んでデータフレームを作成
project_id = "test-420302"
client = bigquery.Client(project=project_id)
query= """
select *
from `test-420302.test.access_log_ga4`
"""
df = client.query(query).to_dataframe()
df.head()
ユーザーIDを削除
df = df.drop("user_pseudo_id", axis=1)
df.head()
基本的なクラスタ分析をする
## クラスタ分析
km = KMeans(n_clusters=3)
cluster = km.fit_predict(df)
df["cluster"] = cluster
df.head()
クラスタ分析の集計をする
pd.concat([df.groupby("cluster").mean(),df["cluster"].value_counts()], axis=1)
クラスタ分析に最適な分類グループ数を探す
sse_list = []
for i in range(1,10):
km = KMeans(n_clusters=i)
cluster = km.fit_predict(df)
sse = km.inertia_
sse_list.append(sse)
plt.figure(figsize=(10,5))
plt.plot(range(1,10), sse_list)
plt.show()
最適なグループ数(5)で再度クラスタ分析
## クラスタ分析
km = KMeans(n_clusters=5)
cluster = km.fit_predict(df)
df["cluster"] = cluster
pd.concat([df.groupby("cluster").mean(),df["cluster"].value_counts()], axis=1)