%%capture
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
import scipy as sc, pandas as pd, seaborn as sns
import gpflow as gp
import numpy as np
import matplotlib.pyplot as plt
sns.set()
The following are some visualizations and tables based on the data gathered in the March survey here. The raw data can be accessed here in the form of a .csv file.
The plots were made by mapping ranks to integers: kyu ranks become negative integers and dan ranks are shifted down by one, so that 1d corresponds to 0 (e.g. 1k -> -1, 1d -> 0, 2d -> 1). The tables are based on OGS ranks. They range from 15k to 7d, as that is where almost all of the responses lie.
# Load the survey responses from the CSV export (file must be in the working directory).
data = pd.read_csv('Go Rank Survey March 2018.csv')
# Preview the first rows, then list the columns with their dtypes and non-null counts.
data.head()
data.info()
def mapping(x):
    """Convert a rank string such as ``'5k'`` or ``'2d'`` to its integer encoding.

    Kyu ranks map to negative integers and every other suffix is treated as a
    dan rank shifted down by one, so the scale is contiguous around 1d == 0:
    ``'2k' -> -2``, ``'1k' -> -1``, ``'1d' -> 0``, ``'2d' -> 1``.

    Anything that is not a non-blank string (e.g. the NaN that pandas uses for
    a missing answer) is returned unchanged so it can be dropped later.

    Raises:
        ValueError: if the part before the suffix is not an integer.
    """
    # Missing answers arrive as float NaN; pass them (and any other
    # non-string value) through untouched instead of relying on a bare except.
    if not isinstance(x, str):
        return x
    rank = x.strip()
    if not rank:
        return x
    grade = int(rank[:-1])
    # Compare case-insensitively so '5K' is not silently read as a dan rank.
    if rank[-1].lower() == 'k':
        return -grade
    return grade - 1
# Work on a copy of every column except the first one
# (presumably a timestamp/ID column rather than a rank — TODO confirm).
X = data.iloc[:,1:].copy()
# Encode every cell as an integer rank via mapping(); missing answers stay missing.
X = X.applymap(mapping)
# Keep only the columns that have more than 5 non-missing responses.
X=X.iloc[:,((X.shape[0]-X.isna().sum())>5).values]
# Shape of the sub-table of respondents who answered every remaining column.
X.dropna().shape
# outliers
# Two rows are removed by hard-coded index label.
# NOTE(review): this assumes the row order of the CSV never changes — confirm before reusing with new data.
X.drop([164,95,], inplace=True)
%%capture --no-display
# Pairwise scatter plots with regression fits (KDE on the diagonal), restricted
# to the columns that have more than 15 non-missing responses.
sns.pairplot(X.iloc[:,((X.shape[0]-X.isna().sum())>5*3).values], diag_kind='kde', kind='reg');
def plot(m, X, Y, lo=-20, hi=10, xlabel='X', ylabel='Y', sigma=2):
    """Draw the data, the model's predictive mean and a +/- ``sigma`` band.

    Args:
        m: fitted model exposing ``predict_y(x)`` that returns ``(mean, var)``.
        X: 1-D array of observed inputs.
        Y: 1-D array of observed outputs.
        lo, hi: accepted for interface compatibility.
            NOTE: both are overwritten below, so the x-range is always derived
            from the data and the values passed by the caller have no effect.
        xlabel, ylabel: axis labels.
        sigma: half-width of the shaded band, in predictive standard deviations.

    Returns:
        The newly created matplotlib figure.
    """
    # Pad the observed range: a quarter of |min| on the left, half of |max| on the right.
    x_min, x_max = X.min(), X.max()
    lo = x_min - abs(x_min * .5 / 2)
    hi = x_max + abs(x_max * .5)

    grid = np.linspace(lo, hi, 1000)[:, None]
    mean, var = m.predict_y(grid)
    centre = mean[:, 0]
    spread = np.sqrt(var[:, 0])

    fig = plt.figure()
    plt.plot(X, Y, 'kx', mew=2)
    plt.plot(grid, mean, 'b', lw=2)
    plt.fill_between(
        grid[:, 0],
        centre - sigma * spread,
        centre + sigma * spread,
        color='blue',
        alpha=0.2,
    )
    plt.xlim(lo, hi)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    return fig
#plot(m, x.T[0], x.T[1])
#sns.regplot(x.T[0], x.T[1])
def get_ranks_on_server(rank_start=-15, rank_end=7, fro='OGS', to='Tygem', lo=-20, hi=10, k=gp.kernels.RBF, meanf=gp.mean_functions.Zero, prior=(10,1e9), sigma=2):
    """Fit a GP regression from ranks on server ``fro`` to ranks on server ``to``.

    Only rows of the global table ``X`` that have a value in both columns are
    used for fitting. The fitted model is then evaluated on the integer ranks
    ``rank_start .. rank_end - 1``.

    Args:
        rank_start, rank_end: half-open integer range of encoded ``fro`` ranks
            to predict for.
        fro, to: column names of the source and target server in ``X``.
        lo, hi: forwarded to ``plot`` (which currently derives its x-range
            from the data instead).
        k: callable taking the input dimension and returning a kernel.
        meanf: zero-argument callable returning a mean function.
        prior: positional arguments for ``gp.priors.Gaussian``, placed on the
            kernel lengthscale.
        sigma: half-width of the returned interval, in predictive standard
            deviations.

    Returns:
        ``(mean, (lower, upper), figure)`` where ``mean`` is the 2-D predictive
        mean, ``lower``/``upper`` are 1-D arrays ``mean -/+ sigma * std`` and
        ``figure`` is the value returned by ``plot``.
    """
    # Only respondents who reported a rank on both servers are usable.
    paired = X[[fro, to]].dropna().values
    src, dst = paired.T[0], paired.T[1]
    # np.arange instead of the scipy alias: SciPy's top-level re-exports of
    # NumPy functions are deprecated and absent from recent releases.
    ranks = np.arange(rank_start, rank_end)

    # Build the model and attach the prior before the graph is compiled.
    with gp.defer_build():
        m = gp.models.GPR(
            X=src.reshape(-1, 1),
            Y=dst.reshape(-1, 1),
            kern=k(1),
            mean_function=meanf(),
        )
        m.kern.lengthscales.prior = gp.priors.Gaussian(*prior)
    m.compile()
    gp.train.ScipyOptimizer(tol=1e-7).minimize(m)

    # Report the optimised lengthscale for the target server.
    print('\n'+to)
    print(m.kern.lengthscales)

    mean, var = m.predict_y(ranks[:, None])
    centre = mean[:, 0]
    spread = sigma * np.sqrt(var[:, 0])
    return (
        mean,
        (centre - spread, centre + spread),
        plot(m, src, dst, lo=lo, hi=hi, xlabel=fro, ylabel=to, sigma=sigma),
    )
def fillna(a='OGS', b='KGS'):
    """Impute missing values of column ``b`` in the global ``X`` from column ``a``.

    A GP regression (RBF kernel, linear mean function) is fitted on the rows
    where both columns are present. Every row that has ``a`` but lacks ``b``
    then receives the model's predictive mean, written into ``X`` in place.
    The shape of the prediction array is printed.
    """
    complete = X[[a, b]].dropna()
    # Unlike get_ranks_on_server, no prior is placed on the lengthscale here.
    with gp.defer_build():
        m = gp.models.GPR(
            X=complete[a].values[:, None],
            Y=complete[b].values[:, None],
            kern=gp.kernels.RBF(1),
            mean_function=gp.mean_functions.Linear(),
        )
    m.compile()
    gp.train.ScipyOptimizer().minimize(m)

    # Rows where the predictor is known but the target is missing.
    to_fill = X[a].notna() & X[b].isna()
    inputs = X[to_fill][a].values[:, None]
    predicted = m.predict_y(inputs)[0]
    print(predicted.shape)
    X.loc[to_fill, b] = predicted.ravel()
%%capture
cols = sorted(X.columns, key=lambda x: X[x].count(), reverse=True)
for c in [x for x in sorted(X.columns, key=lambda x: X[x].count(), reverse=True) if x!='OGS']:
#fillna('MIX',c)
fillna(c,'OGS')
def n_to_rank(n):
    """Turn an encoded rank back into a label, rounded to a whole rank.

    Negative values become kyu labels (``-3 -> '3k'``) and non-negative values
    become dan labels shifted up by one (``0 -> '1d'``). Inputs above 500 are
    returned as a rounded ``int`` rather than a label (presumably numeric
    ratings rather than ranks — TODO confirm).
    """
    level = int(round(n))
    if n > 500:
        return level
    return f'{-level}k' if level < 0 else f'{level + 1}d'
def n_to_rank_float(n):
    """Turn an encoded rank back into a label, keeping one decimal place.

    Negative values become kyu labels (``-3.24 -> '3.2k'``) and non-negative
    values become dan labels shifted up by one (``2.5 -> '3.5d'``). Inputs
    above 500 are returned as a number rounded to one decimal instead of a
    label.
    """
    rounded = round(n, 1)
    if n > 500:
        return rounded
    magnitude, suffix = (rounded, 'k') if rounded < 0 else (rounded + 1, 'd')
    return f'{abs(magnitude)}{suffix}'
def get_rank_tables(against='KGS', lo=-15, hi=7, prior=(10,1e6)):
    """Build rank-conversion tables from server ``against`` to every other column of ``X``.

    For each other column a GP (RBF kernel, linear mean function, 1-sigma
    interval) is fitted via ``get_ranks_on_server`` and evaluated on the
    integer ranks ``lo .. hi - 1`` of ``against``. Each fit also creates a
    figure as a side effect.

    Args:
        against: column of ``X`` used as the reference server.
        lo, hi: half-open integer range of encoded reference ranks.
        prior: forwarded to ``get_ranks_on_server`` as the lengthscale prior
            arguments.

    Returns:
        ``(meantable, stdtable, combinedtable)``, three DataFrames with one row
        per reference rank and ``against`` as the first column:
        the predicted rank label, the ``'low - high'`` label interval, and
        ``'<rank with one decimal> ± <half-width>'`` respectively.
    """
    meantable = pd.DataFrame()
    stdtable = pd.DataFrame()
    combinedtable = pd.DataFrame()

    for s in X.drop(columns=[against]).columns:
        mean, ci, _ = get_ranks_on_server(
            lo,
            hi,
            against,
            s,
            k=lambda x: gp.kernels.RBF(1),
            meanf=gp.mean_functions.Linear,
            prior=prior,
            sigma=1,
        )
        centre = mean[:, 0]
        lower, upper = ci
        # Inserting at position 0 keeps the original (reversed) column order.
        meantable.insert(loc=0, column=s, value=[f'{n_to_rank(v)}' for v in centre])
        stdtable.insert(
            loc=0,
            column=s,
            value=[f'{n_to_rank(a)} - {n_to_rank(b)}' for a, b in zip(lower, upper)],
        )
        combinedtable.insert(
            loc=0,
            column=s,
            value=[f'{n_to_rank_float(v)} ± {round(b - v, 1)}' for v, b in zip(centre, upper)],
        )

    # np.arange instead of the scipy alias: SciPy's top-level re-exports of
    # NumPy functions are deprecated and absent from recent releases.
    reference = [f'{n_to_rank(r)}' for r in np.arange(lo, hi)]
    for table in (meantable, stdtable, combinedtable):
        table.insert(loc=0, column=against, value=reference)
    return meantable, stdtable, combinedtable
The plots below mainly serve to show visually how imprecise the estimates in the tables that follow are.
%%capture --no-display
# Reference server whose ranks index the conversion tables.
a='OGS'
# Build the mean, interval and combined tables against the reference server.
# NOTE(review): `10-2` evaluates to 8; if a small value such as 1e-2 was
# intended for the second prior argument this is a typo — confirm.
mt1,st1,ct1=get_rank_tables(against=a, prior=(10,10-2))
# Column order used when displaying the tables.
cols = ['KGS','OGS','IGS','DGS','Tygem','Foxwq','WBaduk','GoQuest','EGF','AGA','Japan','China',]
# Show each table with the reference server's rank as the index.
mt1[cols].set_index(a)
st1[cols].set_index(a)
ct1[cols].set_index(a)