
pandas usage tips

References: Pandas basic usage · Cookbook · Common pandas usage (Jianshu) · Feature engineering with pandas for beginners

Plus assorted odds and ends, kept here for quick lookup.

Note: most of the code below omits the import statements; add them yourself where needed.
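For convenience, a minimal sketch of the imports most snippets below assume (the later model sections additionally need scikit-learn, lightgbm, etc.):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns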

DataFrame column operations: missing values, replacement, dropping whole columns

df_train.isnull().sum()   ## count the missing values in each column

df_train["标签1"].fillna("normal", inplace=True)  ## fill missing values

## replace values with a mapping dict
df_train['标签1'].replace({'钱庄账户': 'target', '上游钱庄账户': 'up'}, inplace=True)

## get a row by position (i.e. the (index+1)-th row)
tmp_df = df_train.iloc[index]

## get the (de-duplicated) values of a column
b = np.unique(da_train["Inverter"])

## drop columns by position
x = [4, 5, 8, 15]
df_train.drop(df_train.columns[x], axis=1, inplace=True)

## filter rows by a column's values
df[df[col] > 0.5]  # keep the rows where column col is greater than 0.5

## sort by a single column
df_files.sort_values('columnName', inplace=True)

## sort by multiple columns
df = df_train.sort_values(['year', 'month', 'day', 'hour', 'minute', 'second'],
                          ascending=[True, True, True, True, True, True])

## strip stray spaces, newlines, etc. from a column (rstrip: right, lstrip: left, strip: both sides)
data['交易日期(总)'] = data['交易日期(总)'].apply(lambda x: str(x).rstrip())

## sum several columns into a new column (three equivalent ways)
df["Test1"] = df['col1'] + df['col2']
data_01["three_add"] = data_01[['pause_video_ratio', 'seek_video_ratio', 'click_forum_ratio']].apply(sum, axis=1)
data["x1"] = data[["a", "b"]].apply(lambda x: x["a"] + x["b"], axis=1)

## check whether each column contains any NaN
data_ra.isnull().any(axis=0)

## replace NaN with 0
df['column'] = df['column'].replace(np.nan, 0)  # one column
df['column'] = df['column'].fillna(value)       # one column
df.fillna(0, inplace=True)                      # whole DataFrame
df.replace(np.nan, 0, inplace=True)             # whole DataFrame

## append the columns of df2 to the right of df1
pd.concat([df1, df2], axis=1)

## cast the listed columns to a given dtype
data_raw[['交易金额', '交易余额']] = data_raw[['交易金额', '交易余额']].astype(float)

## rename columns
data_a.columns = ['交易账卡号', '交易总天数']

## nested list (rows of three values) to DataFrame (unclear whether ragged lists work)
df = pd.DataFrame(my_triple_list, columns=['A', 'B', 'C'])

## text extraction: take the first two letters of Item_Identifier (e.g. DRC01) into a new column Item_Code (e.g. DR)
data['Item_Code'] = data['Item_Identifier'].apply(lambda x: x[0:2])

DataFrame row operations:

## drop rows by label
data_1.drop([0, 1], inplace=True)

## drop the rows whose 交易金额 is NaN
data_ra.drop(data_ra[np.isnan(data_ra['交易金额'])].index, inplace=True)

## drop duplicate rows
data_ra.drop_duplicates(subset=['交易账卡号', '交易日期(总)', '收付标志', '交易金额', '交易余额'], keep='first', inplace=True)

## drop the rows holding an empty string (drop by the collected row labels)
data_ra.drop(labels=list(data_ra.loc[data_ra['交易金额'] == ''].index), inplace=True)

## show full rows and columns in a Jupyter notebook
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

## iterate over rows
for i in range(len(my_df.index)):
    tmp_df = my_df.iloc[i]  # each row comes back as a Series
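An alternative sketch using iterrows, which yields the index together with the row as a Series (columnName below is a placeholder column name):

for idx, row in my_df.iterrows():
    print(idx, row['columnName'])  # row is a Series keyed by column name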

DataFrame and lambda/apply

## apply over the whole DataFrame row by row: use one column in the computation and let apply store the result as a new column
a0["交易笔数"] = a0.apply(lambda x: 1 if x.交易余额 < 1000 else 0, axis=1)

## multi-branch if/elif: https://blog.csdn.net/weixin_39750084/article/details/103437665
df['col1'] = df['col2'].apply(lambda x: 1 if x == A else (2 if x == B else (3 if x == C else 4)))
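With many branches the nested lambdas become hard to read; a sketch of the same mapping using numpy.select (A, B and C stand for whatever values col2 takes, as in the line above):

import numpy as np
conditions = [df['col2'] == A, df['col2'] == B, df['col2'] == C]
choices = [1, 2, 3]
df['col1'] = np.select(conditions, choices, default=4)  # 4 is the fallback value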

DataFrame concatenation (adding rows)

## stack output4 and output5 below output3, matched by column name; the row count grows
data_ra = pd.concat([output3, output4, output5])

## append the rows of df2 to the bottom of df1
df1.append(df2)  # note: DataFrame.append was removed in pandas 2.0; prefer pd.concat([df1, df2])

## string concatenation
s = '{}{}'.format('st1', 'st2')  # avoid naming the variable str, which shadows the built-in

DataFrame <-> JSON

## conversions between DataFrame and JSON
# DataFrame -> JSON string -> JSON object -> string
import json
jsonstr = df.to_json()
jsonobject = json.loads(jsonstr)
newString = json.dumps(jsonobject)
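The reverse direction (JSON string back to a DataFrame) is not shown above; a minimal sketch, assuming the string came from to_json() with its default column-oriented layout:

from io import StringIO
df_back = pd.read_json(StringIO(jsonstr))     # JSON string -> DataFrame
df_back2 = pd.DataFrame(json.loads(jsonstr))  # or go through a plain dict first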

Merge

Reference: [Python3] pandas.merge usage explained in detail

### common variants of merge
pd.merge(df1, df2, on='key1')                             # key has the same name in both frames
pd.merge(df1, df2, left_on='key1', right_on='key2')       # keys have different names
pd.merge(df1, df2, on='key**', how='left')                # join type; also inner, right, outer, etc.
pd.merge(df1, df2, left_on='key1', right_index=True)      # the right frame joins on its index
pd.merge(df1, df2, on='key1', suffixes=('_df1', '_df2'))  # overlapping columns get _x/_y suffixes by default; suffixes customizes them
pd.merge(df1, df2, how='inner', on=['key1', 'key2'])      # join on multiple key columns

### merging several DataFrames in one go
import pandas as pd
from functools import reduce
dfs = [df1, df2, df3]
df_final = reduce(lambda left, right: pd.merge(left, right, on='col', how='left'), dfs)

DataFrame groupby usage

## groupby + agg; custom aggregators such as diff must be defined beforehand
a1 = a0.groupby(['交易账号', '交易日期']).agg({'交易账号': 'first', '标签1': 'first', '交易金额': diffAmont, '交易笔数': diff})

## group by Item_Identifier and Item_Type, then broadcast the mean of Item_Outlet_Sales back onto every row
data['Item_Outlet_Sales_Mean'] = data.groupby(['Item_Identifier', 'Item_Type'])['Item_Outlet_Sales'].transform(lambda x: x.mean())
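diffAmont and diff above are user-defined aggregators that the snippet does not show; a hypothetical sketch of what they could look like (each receives the grouped column as a Series):

def diffAmont(s):
    return s.max() - s.min()  # hypothetical: spread of the amounts within the group

def diff(s):
    return s.count()          # hypothetical: number of records in the group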

DataFrame datetime handling

## parse the date column and extract the year
df_train["date"] = pd.to_datetime(df_train["交易日期"], format='%Y/%m/%d')
df_train["year"] = pd.to_datetime(df_train["date"]).dt.year

## sort by a yyyy-mm-dd HH:MM:SS column
data_raw['交易日期(总)'] = pd.to_datetime(data_raw['交易日期(总)'])
data_raw = data_raw.sort_values('交易日期(总)')

## bucket the 24-hour day into time bands
data_7['交易时间段'] = pd.cut(data_7['交易时刻'], [0, 4, 8, 12, 16, 20, 24],
                         labels=['(0,4]', '(4,8]', '(8,12]', '(12,16]', '(16,20]', '(20,24]'],
                         include_lowest=True)

## convert a datetime string to a Unix timestamp
import time
def transforms(s):
    return int(time.mktime(time.strptime(s, "%Y-%m-%d %H:%M:%S")))
data_raw['交易日期(总)'] = data_raw['交易日期(总)'].astype(str)
data_raw['交易日期(总)'] = data_raw['交易日期(总)'].apply(transforms)


## measure how long a piece of code takes
import time
sqlquerrystart = time.perf_counter()
# ... the code being timed ...
sqlquerryend = time.perf_counter()
print('sqlquerry time: %s Seconds' % (sqlquerryend - sqlquerrystart))

Reading files with Python

done_list holds the paths of files that have already been processed; the reader below skips them automatically.

# collect the file name and date from each path
import os
import pandas as pd

def gci(filepath):
    done_list = []
    files_msg = []
    files = os.listdir(filepath)
    for fi in files:
        if fi.startswith("done_list"):
            with open(os.path.join(filepath, fi)) as f_0:
                lines = f_0.readlines()
            for line in lines:
                done_list.append(line.replace('\n', ''))
    for fi in files:
        fi_d = os.path.join(filepath, fi)
        # sub-directories under the same path
        if os.path.isdir(fi_d):
            # walk the files inside
            for dd in os.listdir(fi_d):
                if dd.endswith(".csv"):
                    day = os.path.basename(fi_d)
                    file_in_day = os.path.join(fi_d, dd)
                    if file_in_day not in done_list:
                        tmp = []
                        tmp.append(dd)
                        tmp.append(file_in_day)
                        tmp.append(day)
                        files_msg.append(tmp)
    df_files = pd.DataFrame(files_msg, columns=['fileName', 'filePath', 'fileCreateDay'])
    df_files.sort_values('fileName', inplace=True)
    return df_files

# append to done_list.txt:
with open("done_list.txt", encoding="utf-8", mode="a") as file:
    file.write(file_in_day)
    file.write('\n')

Deleting files with Python

# delete a file
os.remove(my_file_path)
print("File " + my_file_path + " removed successfully")

Connecting to MySQL from Python

from sqlalchemy import create_engine
from urllib import parse
import pymysql

password = 'MySQL-root@[st098]'
pwd = parse.quote_plus(password)  # URL-encode the special characters in the password
DB_CONNECT = f'mysql+pymysql://root:{pwd}@***:3306/test?charset=utf8'
engine = create_engine(DB_CONNECT)
conn = engine.connect()
conn.execute("delete from `" + table_name + "` where Day = '" + day + "'")  # MySQL quotes identifiers with backticks, not square brackets
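String-concatenated SQL is fragile; a sketch of the same delete with bound parameters via sqlalchemy.text (my_table is a placeholder, since bound parameters can only stand in for values, not for identifiers such as table names):

from sqlalchemy import text
conn.execute(text("delete from my_table where Day = :day"), {"day": day})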

Sending MQTT messages from Python

# -*- coding: utf-8 -*-
import pandas as pd
import os
import paho.mqtt.client as mqtt

MQTTHOST = "your_ip"
MQTTPORT = 61613
mqttClient = mqtt.Client()

# connect to the MQTT broker
def on_mqtt_connect():
    mqttClient.username_pw_set('admin', password='your_pwd')
    mqttClient.connect(MQTTHOST, MQTTPORT, 60)
    mqttClient.loop_start()

# publish a message
def on_publish(topic, payload, qos):
    mqttClient.publish(topic, payload, qos)

on_mqtt_connect()
on_publish(topic, payload, 0)

Running a task with multiple threads/processes (also usable for reading files)

Reference: multi-threaded / multi-process file reading in Python

# process-pool version
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import threading

a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
b = [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]

def fn(param1, param2):
    print("param1:", param1, ", param2:", param2)

thread_num = 5
with ProcessPoolExecutor(thread_num) as executor:
    for index in range(len(a)):
        executor.submit(fn, a[index], b[index])
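Since the heading also mentions threads and file reading, here is a thread-pool sketch for the I/O-bound case (read_one and the file list are placeholders):

import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def read_one(path):
    return pd.read_csv(path)  # hypothetical per-file worker

paths = ['a.csv', 'b.csv', 'c.csv']  # hypothetical list of files
with ThreadPoolExecutor(max_workers=5) as executor:
    dfs = list(executor.map(read_one, paths))  # results come back in input order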

Feature engineering & visualization

### value counts of a column
pd.DataFrame(data_00['some_fea'].value_counts())

### histogram
plt.hist(data, range=(0, 80), bins=16)

### one-hot encode the Outlet_Type column into several dummy columns and attach them to data
Outlet_Type_Dumm = pd.get_dummies(data=data['Outlet_Type'], columns=['Outlet_Type'], drop_first=True)
pd.concat([data['Outlet_Type'], Outlet_Type_Dumm], axis=1).head()

### Z-score all the features
## train_fea (the list of feature columns) is declared beforehand
# z-score standardization (all features)
from sklearn.preprocessing import StandardScaler
# instantiate the scaler
scaler = StandardScaler()
data_mean_scaled = pd.DataFrame(scaler.fit_transform(data01[train_fea]), columns=train_fea)

### bin the Item_MRP column with cut; the bin edges are given explicitly, so the bins are not guaranteed to hold equal numbers of observations
# define bins
bins = [0, 70, 140, 210, 280]
# names of the groups
groups = ['Low', 'Med', 'High', 'Exp']
data['Item_MRP_Bin_cut'] = pd.cut(data['Item_MRP'], bins=bins, labels=groups)

### qcut bins by frequency, so every bin holds roughly the same number of observations; qcut is needed less often and cut is usually preferred
groups = ['Low', 'Med', 'High', 'Exp']
data['Item_MRP_Bin_qcut'] = pd.qcut(data['Item_MRP'], q=4, labels=groups)

NumPy's axis

axis=0 means the operation moves down, across the rows, while axis=1 means it moves across the columns; think of axis as an adverb describing the direction in which the method acts. pandas keeps NumPy's meaning of the axis keyword.

  • axis=0: apply the method downward, along the row index (i.e. once per column)
  • axis=1: apply the method sideways, along the column labels (i.e. once per row)
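A small sketch to make the direction concrete (the values are made up):

import pandas as pd
df = pd.DataFrame([[1, 2, 3],
                   [4, 5, 6]], columns=['a', 'b', 'c'])
df.sum(axis=0)  # down the rows: one sum per column -> a=5, b=7, c=9
df.sum(axis=1)  # across the columns: one sum per row -> 6 and 15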

Plotting with seaborn

## two plots in one row
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))
sns.distplot(v14_fraud_dist, ax=ax1, fit=norm, color='#FB8861')
sns.distplot(v14_fraud_dist, ax=ax2, fit=norm, color='#FB8861')

## four plots in one row
f, axes = plt.subplots(ncols=4, figsize=(20, 4))
sns.boxplot(x="target", y='间隔交易时间_std', data=data, palette=None, ax=axes[0])
...

## four plots in two rows
f = plt.figure(figsize=(20, 10))
axes = [plt.subplot2grid((2, 2), (0, 0)), plt.subplot2grid((2, 2), (0, 1)),
        plt.subplot2grid((2, 2), (1, 0)), plt.subplot2grid((2, 2), (1, 1))]
sns.boxplot(x="target", y="交易金额大于1000的次数", data=data, palette=None, ax=axes[0])
...

## bar chart of class counts
colors = ["#0101DF", "#DF0101"]
sns.countplot('target', data=data_show, palette=colors)
plt.title('Class Distributions \n (0: normal || 1: suspect)', fontsize=14)

## pairwise scatter plots between features
cols = ['isMulti_y', 'in_out_ratio', '交易余额均值', '密集交易数据']
sns.pairplot(data01[cols], size=2.5)
plt.show();

MD5 hashing in Python

import hashlib

# MD5 of the original value plus a salt
def create_md5(pwd, salt='128t'):
    if pwd != '':
        md5_obj = hashlib.md5()
        md5_obj.update((str(pwd) + str(salt)).encode("utf-8"))
        # print("pwd:", pwd, ", salt:", salt, ", md5:", md5_obj.hexdigest())
        return md5_obj.hexdigest()
    else:
        return ''

data['target'] = data['target'].apply(create_md5)  ## hash the target column in place

Inspecting feature importance of tree models

## gbm passed in below is an already trained model
plt.rcParams['savefig.dpi'] = 100  # saved-image resolution
plt.rcParams['figure.dpi'] = 100   # figure resolution

# feature importance of the model
def _show_feature_import(clf, train_x):
    print('------feature importance------')
    feature_imp = np.zeros([train_x.shape[1]])  # df.shape[0] = number of rows, df.shape[1] = number of columns
    feature_imp += clf.feature_importances_
    feature_importance = pd.DataFrame({'features': train_x.columns, 'importance': feature_imp})
    feature_importance['importance_rate'] = feature_importance['importance'] / feature_importance['importance'].sum()
    feature_importance = feature_importance.sort_values(by='importance', ascending=False).reset_index(drop=True)
    feature_importance['cumsum_rate'] = np.cumsum(feature_importance['importance_rate'])
    plt.figure(figsize=(5, 6))
    sns.barplot(x='importance', y='features', data=feature_importance[0:10])
    plt.show()
    return feature_importance

# call the function to get the feature importances
feature_importance = _show_feature_import(gbm, x_train)

lightgbm

x_train, x_test, y_train, y_test = train_test_split(X_train,
                                                    Y_train,
                                                    test_size=0.25)

params = {'num_leaves': 250,
          'max_depth': 7,
          'min_child_weight': 0.8,
          'feature_fraction': 0.8,
          'bagging_fraction': 0.8,
          'min_data_in_leaf': 1,
          'learning_rate': 0.03,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          # "metric": 'map',
          'reg_alpha': 1,
          'reg_lambda': 1,
          'random_state': 233,
          'is_unbalanced': False
          }
gbm = lgb.LGBMClassifier(**params, n_estimators=500)
gbm.fit(x_train, y_train.astype('int'),
        eval_set=[(x_test, y_test)],
        early_stopping_rounds=50,
        verbose=100)

# plot the ROC curve
y_probs = gbm.predict_proba(X_test)[:, 1]  # predicted scores of the model
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_probs)
roc_auc = auc(fpr, tpr)  # AUC is the area under the ROC curve; now draw it
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.xlabel('False Positive Rate')  # x axis is fpr
plt.ylabel('True Positive Rate')   # y axis is tpr
plt.title('LightGBM ROC curve')
plt.show()

Random forest + GridSearchCV

# instantiate a random forest classifier
rd = RandomForestClassifier(random_state=1)

param_grid = {
    "n_estimators": np.arange(2, 30),  # number of trees
    "max_depth": np.arange(2, 20),     # maximum tree depth
}

# GridSearchCV tunes the random forest directly
algo = GridSearchCV(estimator=rd, param_grid=param_grid, cv=5, verbose=True)
algo.fit(X_train, Y_train)    # train the model
pre_x = algo.predict(X_test)  # predict

print("Training-set score:", algo.score(X_train, Y_train))  # training accuracy
print("Test-set score:", algo.score(X_test, Y_test))        # test accuracy

sns.heatmap(confusion_matrix(Y_test, pre_x), annot=True, fmt='.10g')  # confusion matrix: rows are true labels, columns are predictions
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

print("Classification report (validation):\n {0}".format(classification_report(Y_test, pre_x, digits=4)))

best_params_ = algo.best_params_
print("Best parameters: {}".format(best_params_))

LR


lr = LogisticRegression()         # instantiate LogisticRegression
lr.fit(X_train, Y_train)          # fit on the training set
lr_y_predit = lr.predict(X_test)  # predict on the test set
print('Accuracy of LR Classifier:%f' % lr.score(X_test, Y_test))  # accuracy on the test set from the model's own score method
print(classification_report(Y_test, lr_y_predit, target_names=['Benign', 'Malignant']))  # benign / malignant

SVM


# RBF kernel, with balanced class weights
svc = SVC(kernel='rbf', class_weight='balanced')
c_range = np.logspace(-5, 15, 11, base=2)
gamma_range = np.logspace(-9, 3, 13, base=2)
# parameter grid for the grid search; cv=3 means 3-fold cross-validation
param_grid = [{'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}]
grid = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)
# train the model
clf = grid.fit(X_train, Y_train)
# accuracy on the test set
score = grid.score(X_test, Y_test)
print('Accuracy: %s' % score)
svc_y_predit = grid.predict(X_test)
print(classification_report(Y_test, svc_y_predit, target_names=['Benign', 'Malignant']))  # benign / malignant

Five-fold Stacking


'''individual base models used in the ensemble'''
lr = LogisticRegression()
svc = SVC(kernel='rbf', class_weight='balanced', probability=True)
rf = RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini')
clfs = [rf, lr, svc]

'''hold out part of the data as the test set'''
X, X_predict, y, y_predict = np.array(X_train), np.array(X_test), np.array(Y_train), np.array(Y_test)


dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

'''5-fold stacking'''
n_folds = 5
skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))  # current sklearn API; the old StratifiedKFold(y, n_folds) form no longer exists
for j, clf in enumerate(clfs):
    '''train each base model in turn'''
    print('=========================')
    dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        '''use fold i for prediction and the remaining folds for training; the predictions become the new feature for fold i'''
        print('+++++++++++++')
        X_train1, y_train1, X_test1, y_test1 = X[train], y[train], X[test], y[test]
        clf.fit(X_train1, y_train1)
        y_submission = clf.predict_proba(X_test1)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    '''for the test set, use the mean of the k models' predictions as the new feature'''
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

params = {'num_leaves': 250,
          'max_depth': 7,
          'min_child_weight': 0.8,
          'feature_fraction': 0.8,
          'bagging_fraction': 0.8,
          'min_data_in_leaf': 1,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          # "metric": 'map',
          'reg_alpha': 1,
          'reg_lambda': 1,
          'random_state': 233,
          'is_unbalanced': False
          }
clf = lgb.LGBMClassifier(**params, n_estimators=300)
clf.fit(dataset_blend_train, y,
        eval_set=[(dataset_blend_test, y_predict)],  # eval_set must use the stacked features so its column count matches the training matrix
        verbose=100)

y_submission_real = clf.predict(dataset_blend_test)
accuracy = accuracy_score(y_predict, y_submission_real)
print('Model accuracy:', accuracy)

sns.heatmap(confusion_matrix(y_predict, y_submission_real), annot=True, fmt='.10g')  # confusion matrix
plt.show()
print("Classification report (validation):\n {0}".format(classification_report(y_predict, y_submission_real, digits=4)))
print("val auc Score: %f" % roc_auc_score(y_predict, y_submission_real))

OPTICS (an optimized take on DBSCAN clustering)


import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

# plot the reachability (decision) graph
def plotReachability(data, eps):
    plt.figure()
    plt.plot(range(0, len(data)), data)
    plt.plot([0, len(data)], [eps, eps])
    plt.show()

# plot the resulting clusters
def plotFeature(data, labels):
    clusterNum = len(set(labels))
    fig = plt.figure()
    scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
    ax = fig.add_subplot(111)
    for i in range(-1, clusterNum):
        colorSytle = scatterColors[i % len(scatterColors)]
        subCluster = data[np.where(labels == i)]
        ax.scatter(subCluster[:, 0], subCluster[:, 1], c=colorSytle, s=12)
    plt.show()

class OPTICS1(object):
    def __init__(self, data, eps=np.inf, minPts=15):
        self.data = data
        self.disMat = self.compute_squared_EDM(data)  # pairwise distance matrix

        self.number_sample = data.shape[0]
        self.eps = eps
        self.minPts = minPts
        self.core_distances = self.disMat[np.arange(0, self.number_sample), np.argsort(self.disMat)[:, minPts - 1]]  # core distances
        self.core_points_index = np.where(np.sum(np.where(self.disMat <= self.eps, 1, 0), axis=1) >= self.minPts)[0]

    # compute the distance matrix
    def compute_squared_EDM(self, X):
        return squareform(pdist(X, metric='euclidean'))

    # training
    def train(self):
        # initialize every point's reachability distance as undefined
        self.reach_dists = np.full((self.number_sample,), np.nan)
        self.orders = []  # output ordering
        start_core_point = self.core_points_index[0]  # start from one core point
        # processed flags
        isProcess = np.full((self.number_sample,), -1)

        # training
        isProcess[start_core_point] = 1
        # pick a core point as the start node and use its core distance as its reachability distance
        self.reach_dists[start_core_point] = self.core_distances[start_core_point]
        self.orders.append(start_core_point)  # append to the output ordering
        seeds = {}  # seed dict, i.e. the priority queue
        seeds = self.updateSeeds(seeds, start_core_point, isProcess)  # update the seed dict
        while len(seeds) > 0:
            nextId = sorted(seeds.items(), key=operator.itemgetter(1))[0][0]  # sort by reachability distance and take the smallest
            del seeds[nextId]
            isProcess[nextId] = 1
            self.orders.append(nextId)  # append to the output ordering
            seeds = self.updateSeeds(seeds, nextId, isProcess)  # update the seed dict and the reachability distances

    # update the reachability distances
    def updateSeeds(self, seeds, core_PointId, isProcess):
        # core distance of the core point core_PointId
        core_dist = self.core_distances[core_PointId]
        # update the reachability distance of every unvisited sample
        for i in range(self.number_sample):
            if isProcess[i] == -1:
                # reachability distance
                new_reach_dist = max(core_dist, self.disMat[core_PointId][i])
                if np.isnan(self.reach_dists[i]):
                    # update the reachability array
                    self.reach_dists[i] = new_reach_dist
                    seeds[i] = new_reach_dist
                elif new_reach_dist < self.reach_dists[i]:
                    self.reach_dists[i] = new_reach_dist
                    seeds[i] = new_reach_dist
        return seeds

    # generate the labels
    def predict(self):
        clusterId = 0
        self.labels = np.full((self.number_sample,), -1)
        for i in self.orders:
            if self.reach_dists[i] <= self.eps:
                self.labels[i] = clusterId
            else:
                if self.core_distances[i] <= self.eps:
                    clusterId += 1
                    self.labels[i] = clusterId

if __name__ == '__main__':
    data = X
    OP = OPTICS1(data, 3, 15)
    OP.train()
    OP.predict()
    plotReachability(OP.reach_dists[OP.orders], 1.7)
    plotFeature(np.array(data), OP.labels)
    le = pd.Series(OP.labels)
-------------The End-------------