用python实现模糊查找
时间: 2023-07-20 09:07:47 浏览: 141
模糊查找是指在字符串的集合中,查找与给定的字符串相似的字符串。Python提供了re模块来处理正则表达式,可以使用正则表达式来实现模糊查找。
以下是一个简单的代码示例:
```python
import re
def fuzzy_search(string, pattern):
    """Return the first part of ``string`` that contains ``pattern`` as a subsequence.

    Every character of ``pattern`` is escaped and the characters are joined
    with the lazy wildcard ``.*?``, so they must appear in order but may be
    separated by other characters.  Matching ignores case.

    Args:
        string: Text to search in.
        pattern: Characters that must occur, in order, in the match.

    Returns:
        The matched substring, or ``None`` when there is no match.
    """
    escaped_chars = [re.escape(char) for char in pattern]
    subsequence_regex = re.compile('.*?'.join(escaped_chars), re.IGNORECASE)
    found = subsequence_regex.search(string)
    if found is None:
        return None
    return found.group(0)
# Example usage
string = 'hello world'
pattern = 'hlo'
result = fuzzy_search(string, pattern)
print(result) # prints: hello
```
该函数接受两个参数:`string`和`pattern`。`string`是要搜索的字符串,`pattern`是要查找的模式。它使用`re.escape`函数逐个转义模式中的字符,并用`.*?`将它们连接起来,形成一个正则表达式,表示这些字符按顺序出现、相邻字符之间可以间隔任意内容。然后,它使用`re.search`函数(忽略大小写)在字符串中查找匹配项。如果找到了匹配项,则返回匹配到的子串,否则返回`None`。在上面的示例中,模式`'hlo'`被转换为正则表达式`h.*?l.*?o`,与字符串`'hello world'`中的`'hello'`匹配。
需要注意的是,`re.search`只返回第一个匹配项,而字符串中可能存在多个匹配项。在这种情况下,可以使用`re.findall`函数来查找所有(互不重叠的)匹配项。
相关问题
python模糊查找
以下是一个使用Python实现模糊查询的例子:
```python
import re
def fuzzy_query(keyword, data_list):
    """Return the items of ``data_list`` in which ``keyword`` can be found.

    ``keyword`` is compiled as a regular expression (it is not escaped) and
    searched for anywhere in each item, ignoring case.

    Args:
        keyword: Regular-expression source to look for.
        data_list: Iterable of strings to filter.

    Returns:
        A new list with the matching items, in their original order.
    """
    matcher = re.compile(keyword, re.IGNORECASE)
    return [entry for entry in data_list if matcher.search(entry)]
# Sample data
data_list = ['apple', 'banana', 'orange', 'grape', 'watermelon']
# Fuzzy query with the keyword 'an'
result = fuzzy_query('an', data_list)
print(result) # Output: ['banana', 'orange']
# Fuzzy query with the keyword 'e' (note that 'orange' also contains an 'e')
result = fuzzy_query('e', data_list)
print(result) # Output: ['apple', 'orange', 'grape', 'watermelon']
```
这个例子中,我们首先导入了Python的re模块,它提供了正则表达式的支持。然后,我们定义了一个名为fuzzy_query的函数,该函数接受两个参数:关键词和待查询的数据列表。在函数内部,我们使用re.compile函数创建了一个正则表达式模式,并使用re.search函数在每个数据项上进行模糊匹配。如果匹配成功,则将该数据项添加到结果列表中。最后,我们返回结果列表。需要注意的是,关键词会被直接当作正则表达式编译(忽略大小写),因此如果关键词中含有`.`、`*`、`(`等特殊字符并希望按字面匹配,应先用re.escape进行转义。
请用python实现模糊决策树
以下是Python实现模糊决策树的示例代码:
```python
import numpy as np
# Membership function of a triangular fuzzy set.
def triangular_membership(x, a, b, c):
    """Return the membership degree of ``x`` in the triangular set ``(a, b, c)``.

    Args:
        x: Value to evaluate.
        a: Left foot of the triangle; membership is 0 at and below it.
        b: Peak of the triangle; membership is 1 there when ``a < b < c``.
        c: Right foot of the triangle; membership is 0 at and above it.

    Returns:
        0 outside the open interval ``(a, c)``, otherwise the value of the
        rising edge (``a < x <= b``) or of the falling edge.
    """
    # Outside the support (feet included) the membership is zero.
    if x <= a or x >= c:
        return 0
    on_rising_edge = a < x <= b
    if on_rising_edge:
        return (x - a) / (b - a)
    return (c - x) / (c - b)
# Node of the decision tree.
class Node:
    """Container for one tree node.

    Internal nodes describe a split (``feature_index``, ``threshold``,
    ``operator``) and reference their ``left`` and ``right`` children; leaf
    nodes carry a ``label``.  Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, operator=None, left=None, right=None, label=None):
        # Split description: feature column, split threshold and split operator.
        self.feature_index, self.threshold, self.operator = feature_index, threshold, operator
        # Child nodes.
        self.left, self.right = left, right
        # Label stored on a leaf node.
        self.label = label
# Fuzzy decision tree classifier.
class FuzzyDecisionTree:
    """Binary decision tree whose splits are cuts of triangular fuzzy sets.

    At every internal node one feature is fuzzified with a triangular set
    built from percentiles of the samples that reached the node:

    * operator ``'<='`` uses the "low" set ``(p0, p33, p67)``;
    * operator ``'>'`` uses the "high" set ``(p67, p100, p100 + (p100 - p67))``.

    A sample goes to the left child when its membership in that set is
    strictly greater than the node's ``threshold`` (a membership level),
    otherwise to the right child.  The set parameters are stored on the
    node as ``membership_params`` so that prediction reuses the fuzzy sets
    found during training instead of recomputing them from the data being
    predicted.
    """

    # Membership level above which a sample is routed to the left child.
    _ALPHA_CUT = 0.5

    def __init__(self, max_depth=5, min_samples_split=2, min_impurity_decrease=0):
        """Store the hyper-parameters; the tree itself is built by ``fit``.

        Args:
            max_depth: Depth at which growth stops and a leaf is created.
            min_samples_split: Nodes with fewer samples become leaves.
            min_impurity_decrease: Nodes whose impurity is at or below this
                value become leaves (Gini is used when it is ``<= 0``,
                entropy otherwise).
        """
        self.max_depth = max_depth  # maximum depth
        self.min_samples_split = min_samples_split  # minimum samples to split
        self.min_impurity_decrease = min_impurity_decrease  # impurity stop level
        self.tree = None  # root node, set by fit()

    def _gini_index(self, y):
        """Return the Gini impurity of the labels ``y``."""
        y = np.asarray(y)
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / y.size
        return float(1.0 - np.sum(proportions ** 2))

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels ``y``."""
        y = np.asarray(y)
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / y.size
        return float(0.0 - np.sum(proportions * np.log2(proportions)))

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    @staticmethod
    def _fuzzy_set_params(values, operator):
        """Return ``(a, b, c)`` of the triangular set used for ``operator``."""
        # One percentile call per column instead of one per sample.
        p0, p33, p67, p100 = np.percentile(values, [0, 33, 67, 100])
        if operator == '<=':
            return p0, p33, p67
        # np.percentile only accepts q in [0, 100], so the right foot of the
        # "high" set is placed beyond the maximum by mirroring the left slope.
        return p67, p100, p100 + (p100 - p67)

    @staticmethod
    def _membership(values, a, b, c):
        """Vectorised triangular membership of ``values`` in ``(a, b, c)``.

        Uses the same rule as the scalar ``triangular_membership`` helper:
        0 outside ``(a, c)``, rising edge up to ``b``, falling edge after it.
        """
        values = np.asarray(values, dtype=float)
        # Degenerate sets (a == b or b == c) divide by zero on an edge that is
        # never selected below, so the warnings are silenced on purpose.
        with np.errstate(divide='ignore', invalid='ignore'):
            rising = (values - a) / (b - a)
            falling = (c - values) / (c - b)
        inside = (values > a) & (values < c)
        return np.where(inside, np.where(values <= b, rising, falling), 0.0)

    def _left_mask(self, column, operator, threshold):
        """Return the fuzzy-set parameters and the mask of left-going samples."""
        params = self._fuzzy_set_params(column, operator)
        return params, self._membership(column, *params) > threshold

    def _find_best_split(self, X, y):
        """Find the feature and fuzzy set whose cut gives the purest children.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like with one label per row of ``X``.

        Returns:
            ``(feature_index, threshold, operator, impurity)`` where
            ``impurity`` is the sample-weighted Gini impurity of the two
            children.  When no cut separates the samples the result is
            ``(None, None, None, 1)``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        best_feature_index, best_threshold, best_operator, best_impurity = None, None, None, 1
        n_samples = X.shape[0]
        if n_samples == 0:
            return best_feature_index, best_threshold, best_operator, best_impurity
        threshold = self._ALPHA_CUT
        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for operator in ('<=', '>'):
                _, goes_left = self._left_mask(column, operator, threshold)
                n_left = int(np.count_nonzero(goes_left))
                # A cut that leaves one side empty is not a split.
                if n_left == 0 or n_left == n_samples:
                    continue
                n_right = n_samples - n_left
                impurity = float(
                    (n_left * self._gini_index(y[goes_left])
                     + n_right * self._gini_index(y[~goes_left])) / n_samples
                )
                if impurity < best_impurity:
                    best_feature_index, best_threshold, best_operator, best_impurity = (
                        feature_index, threshold, operator, impurity)
        return best_feature_index, best_threshold, best_operator, best_impurity

    def _create_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``X`` / ``y``."""
        n_samples = X.shape[0]
        # Too few samples or depth limit reached: majority-vote leaf.
        if n_samples < self.min_samples_split or depth >= self.max_depth:
            return Node(label=self._majority_label(y))
        # Node impurity: Gini by default, entropy when a positive stop level is set.
        impurity = self._gini_index(y) if self.min_impurity_decrease <= 0 else self._entropy(y)
        if impurity <= self.min_impurity_decrease:
            return Node(label=self._majority_label(y))
        feature_index, threshold, operator, impurity = self._find_best_split(X, y)
        # No cut separates the samples: majority-vote leaf.
        if feature_index is None or threshold is None:
            return Node(label=self._majority_label(y))
        params, goes_left = self._left_mask(X[:, feature_index], operator, threshold)
        left = self._create_tree(X[goes_left], y[goes_left], depth + 1)
        right = self._create_tree(X[~goes_left], y[~goes_left], depth + 1)
        node = Node(feature_index=feature_index, threshold=threshold, operator=operator, left=left, right=right)
        # Remember the training-time fuzzy set so predict() can reuse it.
        node.membership_params = tuple(float(p) for p in params)
        return node

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like of ``n_samples`` class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` does not hold one label
                per row of ``X``, or the data set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or len(y) != len(X):
            raise ValueError("y must be a 1-D array with one label per row of X")
        if len(X) == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        self.tree = self._create_tree(X, y, 0)

    def predict(self, X):
        """Return the list of predicted labels for the rows of ``X``.

        ``fit`` must have been called first.
        """
        X = np.asarray(X)
        y_pred = []
        for sample in X:
            node = self.tree
            # Leaves are the only nodes that carry a label.
            while node.label is None:
                degree = self._membership(sample[node.feature_index], *node.membership_params)
                node = node.left if degree > node.threshold else node.right
            y_pred.append(node.label)
        return y_pred
```
需要注意的是,由于模糊决策树的划分方式不同于传统决策树,因此需要重新定义节点类和创建模糊决策树的方法。在示例代码中,我们使用三角隶属度函数来定义模糊集合的隶属度函数,并使用三等分法将特征值划分成三个模糊集合。在寻找最优划分特征和阈值时,我们需要对每个特征的每个模糊集合计算不纯度,并选择不纯度最小的划分方式作为最优划分方式。在拆分数据集时,我们需要根据划分特征和阈值将每个样本划分到相应的模糊集合中,并根据模糊集合的隶属度计算每个样本在左子节点和右子节点中的隶属度,以此来决定每个样本应该属于哪个子节点。在预测样本标签时,我们需要在树中递归查找每个样本所属的叶子节点,并将该节点的标签作为预测结果。
阅读全文