Allenfenqu/partition_main_0814_kmeans(...

189 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
# Number of clusters (initial sub-regions) requested from KMeans.
Initial_partitions = 60

# Load the first five columns of the link table and name them:
# start/end coordinates of each road segment plus its speed.
df = pd.read_csv('links_processed.csv', usecols=[0, 1, 2, 3, 4])
df.columns = ['start_lat', 'start_long', 'end_lat', 'end_long', 'speed']

# Midpoint of every road segment, rounded to 7 decimal places.
df['center_lat'] = df['start_lat'].add(df['end_lat']).div(2).round(7)
df['center_long'] = df['start_long'].add(df['end_long']).div(2).round(7)

# Only the midpoint coordinates are used as clustering features; they are
# standardised before being handed to KMeans.
features = df[['center_lat', 'center_long']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Partition the segments into `Initial_partitions` clusters.
kmeans = KMeans(n_clusters=Initial_partitions, n_init=10)
kmeans.fit(scaled_features)

# Store the cluster label shifted up by one, then switch to a plain array.
# With the eight columns built above, the label ends up in column 7.
df['cluster'] = kmeans.labels_ + 1
df = df.to_numpy()
# Read the complete link table (all columns) as a plain array.
links = pd.read_csv('links_processed.csv')
links = links.to_numpy()

# Stack every start point (columns 0-1) on top of every end point
# (columns 2-3) along axis 0, then append a zero column that will receive
# the node id. Rows [0, n) are start points, rows [n, 2n) are end points.
node = np.concatenate((links[:, :2], links[:, 2:4]), axis=0)
node = np.hstack((node, np.zeros((len(node), 1))))

# Label the junctions: every distinct coordinate pair gets one id, numbered
# from 1 in order of first appearance, so a junction shared by several links
# carries the same id everywhere.
# A dict lookup replaces the previous rescan of all earlier rows for every
# row (quadratic in the number of endpoints); the ids produced are the same.
noi = 1  # next unused node id
_node_id_by_coord = {}
for i, coord in enumerate(zip(node[:, 0].tolist(), node[:, 1].tolist())):
    assigned = _node_id_by_coord.get(coord)
    if assigned is None:
        assigned = noi
        _node_id_by_coord[coord] = assigned
        noi += 1
    node[i, 2] = assigned

# Fold the id column back into one row per link:
# column 0 = id of the start node, column 1 = id of the end node.
_half = len(node) // 2
node = np.concatenate(
    (node[:_half, 2].reshape(-1, 1), node[_half:, 2].reshape(-1, 1)),
    axis=1,
)
# Original author's note (translated): extra padding had to be appended to
# `links` before `yanlinks` could be assembled, but the author flagged that
# the resulting `yanlinks` is then not quite right.
# Three zero columns are appended to `links` in a single step.
links = np.hstack((links, np.zeros((len(links), 3))))

# yanlinks column layout (14 columns):
#   0-1   start / end node id (from `node`)
#   2-4   links columns 5, 6, 7
#   5     links column 4
#   6-9   links columns 0-3
#   10-13 zeros, filled in later
yanlinks = np.hstack(
    (
        node,
        links[:, [5, 6, 7, 4, 0, 1, 2, 3]],
        np.zeros((len(links), 4)),
    )
)

# Column 4 is overwritten with a 1-based sequential link id.
yanlinks[:, 4] = np.arange(1, len(yanlinks) + 1)

# One entry per link, and an all-zero link-by-link adjacency matrix.
road = np.arange(1, node.shape[0] + 1)
adjacency = np.zeros((road.size, road.size))
# Initial partition set-up: fill the link-by-link adjacency matrix.
# Links i and j (j > i) are adjacent when either node id of link j equals
# either node id of link i; the matrix is filled symmetrically.
first_ids = node[:, 0]
second_ids = node[:, 1]
for i in range(len(road)):
    head, tail = node[i, 0], node[i, 1]
    # Only rows after i are inspected; the mirrored entry covers earlier rows.
    later_first = first_ids[i + 1:]
    later_second = second_ids[i + 1:]
    shares_node = (
        (later_first == head)
        | (later_second == head)
        | (later_first == tail)
        | (later_second == tail)
    )
    neighbours = np.where(shares_node)[0] + (i + 1)
    if neighbours.size:
        adjacency[i, neighbours] = 1
        adjacency[neighbours, i] = 1

# Indices of links whose adjacency row is entirely zero (no neighbour).
row_sums = adjacency.sum(axis=1)
zero_row_indices = np.where(row_sums == 0)[0]
# Column 3 receives links column 9; column 10 receives the cluster label
# stored in column 7 of `df`.
yanlinks[:, 3] = links[:, 9]
yanlinks[:, 10] = df[:, 7]

# Drop every link whose label is 0 or -1.
_labelled = (yanlinks[:, 10] != 0) & (yanlinks[:, 10] != -1)
yanlinks = yanlinks[_labelled]

# Distinct node ids still in use, and a fresh all-zero node-by-node matrix.
road = np.unique(np.concatenate((yanlinks[:, 1], yanlinks[:, 0]), axis=0))
adjacency = np.zeros((len(road), len(road)))

# adregion is indexed by (link id - 1), where the link id is yanlinks column 4.
_n_link_ids = int(np.max(yanlinks[:, 4]))
adregion = np.zeros((_n_link_ids, _n_link_ids))

# Mark two remaining links as adjacent when they share a node id.
# Endpoints and matrix positions are taken from the filtered `yanlinks`
# itself rather than from the unfiltered `node` array and the loop counter,
# so rows stay aligned with their link ids even when the label filter above
# removed some links. When nothing was removed the result is unchanged.
_link_pos = yanlinks[:, 4].astype(int) - 1
_start_ids = yanlinks[:, 0]
_end_ids = yanlinks[:, 1]
for i in range(len(yanlinks)):
    later_start = _start_ids[i + 1:]
    later_end = _end_ids[i + 1:]
    touching = (
        (later_start == _start_ids[i])
        | (later_end == _start_ids[i])
        | (later_start == _end_ids[i])
        | (later_end == _end_ids[i])
    )
    temp = _link_pos[i + 1:][touching]
    if len(temp) > 0:
        adregion[_link_pos[i], temp] = 1
        adregion[temp, _link_pos[i]] = 1

# adregion holds the link-to-link adjacency relation; persist it.
np.save('adregion.npy', adregion)
# Weight adregion by region: the column belonging to each link (link id - 1)
# is multiplied by that link's cluster label, so a non-zero entry in a row
# now names the region of the adjacent link.
for link_row in yanlinks:
    col = int(link_row[4]) - 1
    adregion[:, col] = adregion[:, col] * int(link_row[10])

subregion_adj = np.zeros((Initial_partitions, Initial_partitions))

# Count how often pairs of regions show up together in one adregion row;
# the count is later used to tell strong from weak relations.
for neighbour_regions in adregion:
    present = np.unique(neighbour_regions)
    present = present[present != 0]
    if present.shape[0] <= 1:
        # Fewer than two distinct regions in this row: no pair to record.
        continue
    region_idx = [int(label) - 1 for label in present]
    for pos_a, reg_a in enumerate(region_idx):
        for pos_b, reg_b in enumerate(region_idx):
            if pos_a == pos_b:
                continue
            # subregion_adj is the region-to-region adjacency; the magnitude
            # expresses how strongly two regions are related. Every ordered
            # pair updates both mirrored cells, as in the original loop.
            subregion_adj[reg_a, reg_b] += 1
            subregion_adj[reg_b, reg_a] += 1

# Keep an unthresholded copy of the counts.
directed_adjacency_matrix = subregion_adj.copy()

# Ignore weak relations: anything below (smallest column maximum - 2) is
# zeroed, then every remaining count above 1 is clipped to 1.
min_value = subregion_adj.max(axis=0).min() - 2
subregion_adj[subregion_adj < min_value] = 0
subregion_adj[subregion_adj > 1] = 1
directed_adjacency_matrix[directed_adjacency_matrix > 1] = 1
# Distinct region labels present in yanlinks column 10.
unique_values, unique_indices = np.unique(yanlinks[:, 10], return_index=True)

# Asb: average similarity. For every region, average over its adjacent
# regions the ratio
#     2 * var_main / (var_neighbour + var_main + (mean_neighbour - mean_main)^2)
# computed on yanlinks column 5, then divide the sum by Initial_partitions.
Asb = 0
for i in unique_values:
    # Regions still flagged as adjacent to region i (0-based indices).
    wu_1 = np.where(subregion_adj[int(i) - 1, :] == 1)[0]
    num_elements = len(wu_1)  # denominator NE
    if num_elements == 0:
        # A region without any retained neighbour previously evaluated
        # 0 / 0 on plain ints and raised ZeroDivisionError; it now simply
        # contributes nothing to the sum.
        continue

    # Mean and variance of the main region; these do not depend on the
    # neighbour, so they are computed once per region.
    selected_values1 = yanlinks[yanlinks[:, 10] == i][:, 5]
    average1 = np.mean(selected_values1)
    variance1 = np.var(selected_values1)
    smrjj = 2 * variance1  # the "jj" case

    smrjj_divide_smrjj_ = 0
    for j in wu_1:
        # Mean and variance of one adjacent region (label j + 1).
        selected_values = yanlinks[yanlinks[:, 10] == j + 1][:, 5]
        average = np.mean(selected_values)
        variance = np.var(selected_values)
        smrjj_ = variance + variance1 + (average - average1) ** 2
        # NOTE(review): if both variances are 0 and the means coincide this
        # is 0 / 0 and yields nan, as before — confirm the intended value.
        smrjj_divide_smrjj_ += smrjj / smrjj_

    Asb += smrjj_divide_smrjj_ / num_elements

Asb = Asb / Initial_partitions
print('Asb=', Asb)
# Tvb: sum over all regions of the variance of yanlinks column 5 within
# that region.
Tvb = sum(
    np.var(yanlinks[yanlinks[:, 10] == region][:, 5])
    for region in unique_values
)
print('Tvb=', Tvb)
# Optional persistence of the intermediate results (left disabled):
# np.save('subregion_adj.npy', subregion_adj)
# np.save('yanlinks.npy', yanlinks)