[データ分析100本ノック] 第8章 数値シミュレーションで消費者行動を予測する10本ノック

Published : 2020-05-30   Lastmod : 2021-11-07

## 第8章 数値シミュレーションで消費者行動を予測する10本ノック

この記事は「Python実践データ分析100本ノック」 の演習を実際にやってみたという内容になっています。今まで自己流でやってきましたが、一度他の方々がどのような考え方やコーディングをしているのか勉強してみようと思ってやってみました。本書は実際の業務に活用する上でとても参考になる内容だと思っています。データ分析に関わる仕事をしたい方にお勧めしたいです。

• 数値計算
• 数値シミュレーション

### github

• jupyter notebook形式のファイルはこちら

### 筆者の環境

!sw_vers

ProductName:	Mac OS X
ProductVersion:	10.14.6
BuildVersion:	18G95

!python -V

Python 3.5.5 :: Anaconda, Inc.


%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import matplotlib
import matplotlib.pyplot as plt
import scipy
import numpy as np
import pandas as pd

print('matplotlib version :', matplotlib.__version__)
print('scipy version :', scipy.__version__)
print('numpy version :', np.__version__)
print('pandas version :', pd.__version__)

matplotlib version : 2.2.2
scipy version : 1.4.1
numpy version : 1.18.1
pandas version : 0.24.2


## 解答

### ノック 71 : 人間関係のネットワークを可視化してみよう

%%bash

,Node0,Node1,Node2,Node3,Node4,Node5,Node6,Node7,Node8,Node9,Node10,Node11,Node12,Node13,Node14,Node15,Node16,Node17,Node18,Node19
Node0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Node1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
Node2,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Node3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Node4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Node5,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Node6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Node7,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Node8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0

# Load the adjacency table of the small network. No index_col is given, so
# the node labels are kept as an extra leading column ('Unnamed: 0') and the
# frame has 21 columns, as the preview below shows.
df_links = pd.read_csv('links.csv')


Unnamed: 0Node0Node1Node2Node3Node4Node5Node6Node7Node8...Node10Node11Node12Node13Node14Node15Node16Node17Node18Node19
0Node00.00.00.00.00.01.00.00.00.0...0.00.00.00.00.01.00.00.00.00.0
1Node10.00.00.00.00.01.00.00.00.0...0.01.00.01.00.00.01.00.00.00.0
2Node20.00.00.00.01.01.01.00.00.0...1.00.00.00.00.00.00.00.00.00.0
3Node30.00.00.00.00.00.00.01.00.0...0.00.00.00.00.01.00.00.00.00.0
4Node40.00.01.00.00.00.00.01.01.0...1.00.00.00.00.01.00.00.00.00.0

5 rows × 21 columns

import networkx as nx

# Build an undirected graph from the adjacency table in df_links.
G = nx.Graph()

# One row per person; later cells refer to this count as NUM.
num = len(df_links.index)
NUM = num

# Register the nodes first, in index order, so that per-node lists built by
# index (e.g. colour lists) are expected to line up with the graph's nodes.
for i in range(num):
    G.add_node(str(i))

# Register the edges.
for i in range(num):
    for j in range(num):
        # Add an edge only where the table marks a relationship (value 1).
        # The column is looked up by name so the leading label column
        # ('Unnamed: 0') cannot shift the indices.
        if df_links['Node' + str(j)].iloc[i] == 1:
            G.add_edge(str(i), str(j))

nx.draw_networkx(G, node_color='r', edge_color='k', font_color='w')
plt.show()


ここではnx.drawではなく、nx.draw_networkxメソッドを利用して関係図を可視化しています。リンクの多いノードほど自動的に中心付近へ配置されます。ただし配置には再現性がなく、実行のたびに変わるようです。

### ノック 72 : 口コミによる情報伝播の様子を可視化してみよう

def determine_link(percent):
    """Return 1 with probability ``percent``, otherwise 0.

    A single sample is drawn with ``np.random.rand()`` and compared with
    ``percent``; the event fires when the sample is less than or equal to it.
    """
    rand_val = np.random.rand()
    if rand_val <= percent:
        return 1
    return 0

def simulate_percolation(num, list_active, percent_percolation):
    """Advance the word-of-mouth spread by one time step.

    For every active node i, each node j linked to i becomes active with
    probability ``percent_percolation``.

    Args:
        num: Number of nodes to scan.
        list_active: Per-node state (1 = active, 0 = inactive). It is
            updated in place, so a node activated earlier in the scan can
            already spread within the same step.
        percent_percolation: Probability that an active node activates a
            linked node.

    Returns:
        The same ``list_active`` object after the update.

    Reads the module-level ``df_links`` table and calls ``determine_link``.
    """
    for i in range(num):
        if list_active[i] == 1:
            for j in range(num):
                # Spread only along existing links; the column is addressed
                # by name so a leading label column does not shift indices.
                if df_links['Node' + str(j)].iloc[i] == 1:
                    if determine_link(percent_percolation) == 1:
                        list_active[j] = 1
    return list_active

# Probability that an active node passes the word to a linked node per step.
percent_percolation = 0.1
# Number of simulated time steps.
T_NUM = 100
# Number of nodes, one per row of the link table.
NUM = len(df_links.index)
list_active = np.zeros(NUM)
# Only node 0 is active at the start.
list_active[0] = 1
list_timeSeries = []

for t in range(T_NUM):
    list_active = simulate_percolation(NUM, list_active, percent_percolation)
    # Store a copy: list_active is mutated in place on the next step.
    list_timeSeries.append(list_active.copy())

def active_node_coloring(list_active):
    """Map per-node activity flags to drawing colours.

    Args:
        list_active: Sequence of node states; 1 means active.

    Returns:
        A list with 'r' for each active node and 'k' for every other node,
        in the same order as ``list_active``.
    """
    # Use the argument itself rather than the globals list_timeSeries / t,
    # so the result depends only on what the caller passes in.
    return ['r' if state == 1 else 'k' for state in list_active]

# Draw the network at three moments of the simulation, colouring each node
# by its state at that time step.
for t in (0, 10, 99):
    snapshot_colors = active_node_coloring(list_timeSeries[t])
    nx.draw_networkx(G, font_color='w', node_color=snapshot_colors)
    plt.show()


### ノック 73 : 口コミ数の時系列変化をグラフ化してみよう

# Number of active nodes at each time step.
list_timeSeries_num = [sum(active) for active in list_timeSeries]

plt.plot(list_timeSeries_num)
plt.grid()
plt.show()


### ノック 74 : 会員数の時系列変化をシミュレーションしてみよう

def simulate_population(num, list_active, percent_percolation, percent_disapparence, df_links):
    """Advance the membership simulation by one time step.

    The step has two phases: spread along links, then random drop-out.

    Args:
        num: Number of nodes to scan.
        list_active: Per-node state (1 = active, 0 = inactive); updated in
            place.
        percent_percolation: Probability that an active node activates a
            linked node.
        percent_disapparence: Probability that any node is reset to
            inactive in this step.
        df_links: Adjacency table with one 'Node<j>' column per node.

    Returns:
        The same ``list_active`` object after both phases.

    Calls the sibling helper ``determine_link`` for every random draw.
    """
    # Spread: each active node may activate the nodes it is linked to.
    for i in range(num):
        if list_active[i] == 1:
            for j in range(num):
                # Address the column by name so a leading label column
                # cannot shift the indices.
                if df_links['Node' + str(j)].iloc[i] == 1:
                    if determine_link(percent_percolation) == 1:
                        list_active[j] = 1

    # Drop-out: every node independently becomes inactive with the given
    # probability.
    for i in range(num):
        if determine_link(percent_disapparence) == 1:
            list_active[i] = 0

    return list_active

# Spread probability per link and drop-out probability per node, per step.
percent_percolation = 0.1
percent_disapparence = 0.05
T_NUM = 100
list_active = np.zeros(NUM)

# Only node 0 is active at the start.
list_active[0] = 1

list_timeSeries = []

for t in range(T_NUM):
    list_active = simulate_population(NUM, list_active, percent_percolation, percent_disapparence, df_links)
    # Store a copy: list_active is mutated in place on the next step.
    list_timeSeries.append(list_active.copy())

# Number of active nodes at each time step.
list_timeSeries_num = [sum(active) for active in list_timeSeries]

plt.plot(list_timeSeries_num)
plt.grid()
plt.show()


# Same simulation with a higher drop-out probability; percent_percolation is
# not reassigned here and keeps its earlier value.
percent_disapparence = 0.2
T_NUM = 100
list_active = np.zeros(NUM)
# Only node 0 is active at the start.
list_active[0] = 1
list_timeSeries = []

for t in range(T_NUM):
    list_active = simulate_population(NUM, list_active, percent_percolation, percent_disapparence, df_links)
    # Store a copy: list_active is mutated in place on the next step.
    list_timeSeries.append(list_active.copy())

# Number of active nodes at each time step.
list_timeSeries_num = [sum(active) for active in list_timeSeries]

plt.plot(list_timeSeries_num)
plt.grid()
plt.show()


### ノック 75 : パラメタの全体像を、相図を見ながら把握しよう

# T_NUM = 100
# NOTE(review): only one step is simulated per grid cell here; presumably
# this was reduced from 100 to keep the run time short - confirm.
T_NUM = 1
# Grid resolution: 20 x 20 combinations of the two probabilities.
NUM_PhaseDiagram = 20
phaseDiagram = np.zeros((NUM_PhaseDiagram, NUM_PhaseDiagram))

for i_p in range(NUM_PhaseDiagram):
    for i_d in range(NUM_PhaseDiagram):
        # Both probabilities sweep 0.00, 0.05, ..., 0.95.
        percent_percolation = 0.05 * i_p
        percent_disapparence = 0.05 * i_d
        list_active = np.zeros(NUM)
        # Only node 0 is active at the start.
        list_active[0] = 1

        for t in range(T_NUM):
            list_active = simulate_population(NUM, list_active, percent_percolation, percent_disapparence, df_links)

        # Record how many nodes are active after the last step.
        phaseDiagram[i_p][i_d] = sum(list_active)

print('END')

END

# Show the phase diagram as a heat map: rows are the spread probability,
# columns are the drop-out probability.
plt.matshow(phaseDiagram)
plt.colorbar(shrink=0.8)
plt.xlabel('percent_disapparence')
plt.ylabel('percent_percolation')

# Every fifth grid index is labelled with its probability (step 0.05 * 5).
tick_positions = np.arange(0.0, 20.0, 5)
tick_labels = np.arange(0.0, 1.0, 0.25)
plt.xticks(tick_positions, tick_labels)
plt.yticks(tick_positions, tick_labels)

# Hide the tick marks on all four sides.
plt.tick_params(bottom=False, left=False, right=False, top=False)
plt.show()


### ノック 76 : 実データを読み込んでみよう

スポーツジムの実際の状況を読み込みます。

# Load the member-to-member link table of the gym data set.
df_mem_links = pd.read_csv('links_members.csv')

# Preview the first rows.
df_mem_links.head()


Unnamed: 0Node0Node1Node2Node3Node4Node5Node6Node7Node8...Node530Node531Node532Node533Node534Node535Node536Node537Node538Node539
0Node00.00.01.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
1Node10.00.00.00.01.01.00.00.00.0...0.01.01.00.00.00.00.00.00.00.0
2Node21.00.00.00.00.01.00.01.00.0...0.00.01.00.01.00.00.01.01.00.0
3Node30.00.00.00.00.00.01.00.00.0...0.00.00.00.00.00.00.00.00.00.0
4Node40.01.00.00.00.01.00.00.00.0...0.00.00.01.01.00.00.00.01.00.0

5 rows × 541 columns

# NOTE(review): df_mem_info is not assigned anywhere in the visible code; the
# line that loads it appears to be missing and must be restored before this runs.
df_mem_info.head()


Unnamed: 0012345678...14151617181920212223
0Node01.01.01.01.01.01.01.00.00.0...1.01.01.01.01.01.01.00.00.00.0
1Node10.00.00.00.00.01.01.01.01.0...1.01.01.01.01.01.01.01.01.01.0
2Node20.00.00.00.00.00.01.01.01.0...1.01.01.01.00.01.01.01.01.01.0
3Node30.00.00.00.00.01.01.01.01.0...1.01.01.01.01.00.01.01.01.00.0
4Node40.00.00.00.00.00.01.01.01.0...1.01.01.01.01.01.01.01.01.01.0

5 rows × 25 columns

### ノック 77 : リンク数の分布を可視化しよう

リンク数の分布を見るために、ヒストグラムを表示してみます。

# Number of members, one per row of the link table.
NUM = len(df_mem_links.index)

# Link count of each member: the sum of that member's 'Node<i>' column.
array_linkNum = np.zeros(NUM)
for i in range(NUM):
    array_linkNum[i] = df_mem_links['Node' + str(i)].sum()

plt.hist(array_linkNum, bins=10, range=(0,250))
plt.grid()
plt.show()


### ノック 78 : シミュレーションのために実データからパラメタを推定しよう

NUM = len(df_mem_info.index)
# The first column holds the node label, so the remaining columns are the
# time steps.
T_NUM = len(df_mem_info.columns) - 1

# Estimate the drop-out probability as
#   (active at one step and inactive at the next) / (active at that step).
# NOTE: the counter names read the other way round from what they hold:
# count_active_to_inactive counts the active observations (denominator) and
# count_active counts the active -> inactive transitions (numerator).
count_active = 0
count_active_to_inactive = 0

# Positional column t with t >= 1 skips the label column at position 0;
# t + 1 is the following time step.
for t in range(1, T_NUM):
    for i in range(NUM):
        if df_mem_info.iloc[i, t] == 1:
            count_active_to_inactive += 1
            if df_mem_info.iloc[i, t + 1] == 0:
                count_active += 1

estimated_percent_disapprence = count_active / count_active_to_inactive

# Start of the spread-probability estimate.
# NOTE(review): this cell is incomplete as shown. df_link_temp and j are
# never defined in the visible code, the loop bodies carry no indentation,
# and the final `if` has no body. The inner loops were presumably lost and
# must be restored from the original notebook.
count_link = 0

for t in range(T_NUM - 1):
# Per-node flags for the current time step (presumably to avoid counting a
# node twice - confirm against the original).
temp_flag_count = np.zeros(NUM)

if df_mem_info.iloc[df_link_temp.index[j]][t + 1] == 1:



### ノック 79 : 実データとシミュレーションを比較しよう

# Probabilities used for the comparison run on the member network.
percent_percolation = 0.02518
percent_disapparence = 0.10147

T_NUM = 24
list_active = np.zeros(NUM)

# Only node 0 is active at the start.
list_active[0] = 1
list_timeSeries = []

for t in range(T_NUM):
    list_active = simulate_population(NUM, list_active, percent_percolation, percent_disapparence, df_mem_links)
    # Store a copy: list_active is mutated in place on the next step.
    list_timeSeries.append(list_active.copy())

# Simulated number of active members at each step.
list_timeSeries_num = [sum(active) for active in list_timeSeries]

# Observed number of active members at each step; the step columns of
# df_mem_info are named '0', '1', ... after the leading label column.
T_NUM = len(df_mem_info.columns) - 1
list_timeSeries_num_real = [
    int((df_mem_info[str(t)] == 1).sum()) for t in range(T_NUM)
]

plt.plot(list_timeSeries_num, label = 'simulated')
plt.plot(list_timeSeries_num_real, label = 'real')
plt.xlabel('month')
plt.ylabel('population')
plt.legend(loc='lower right')
plt.grid()
plt.show()


### ノック 80 : シミュレーションによる将来予測を実施しよう

T_NUM = 36