使用 DBSCAN 同时聚类离散和连续特征 - 确认设置正确?
Simultaneously clustering discrete and continuous features with DBSCAN - confirmation this is setup correctly?
这是我第一次尝试使用 DBSCAN,对我从网页中提取的文本内容块同时按连续特征(数据点的边界宽度)和离散特征(计算出的 css 以及数据点的路径)进行聚类。
我有 7 个样本(在第一个数据集中),所以当我将 DBSCAN min_samples 设置为 1 时,这个输出是我所期望的:
- 估计的簇数:7
- 预计噪声点数:0
- 同质性:1.000
- 完整性:1.000
然后我尝试把这些簇绘制出来以便可视化。绘图时我使用了 sklearn 的示例代码,并根据我的数据做了调整。但是,生成的图看起来不太对。
看起来大多数簇的y轴值都是相同的(-0.408)。我相信这取决于在这一步使用 StandardScaler():
feature_stack = np.hstack([continuous_features, discrete_features])
"""[[-1.31614507 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. ]
[-0.66130166 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. ]"""
features = feature_stack.astype(np.float32)
"""[[-1.3161451 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. ]
[-0.6613017 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. ]"""
# CLUSTER DATA
scaled_data = StandardScaler().fit_transform(features)
# scaled_data
# [[-1.3161452 -0.4082483 0.40824828 2.4494898 2.4494898 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 1.581139 1.581139 -0.4082483 1.1547004 -0.4082483 1.581139 -0.4082483 -0.4082483 -0.6324555 1.581139 -0.4082483 -0.4082483 -0.6324555 -0.4082483 2.4494898 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.6324556 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -1.1547006 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 1.581139 -0.4082483 ]
# [-0.66130173 -0.4082483 0.40824828 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.6324556 1.581139 -0.4082483 1.1547004 -0.4082483 1.581139 -0.4082483 -0.4082483 -0.6324555 1.581139 -0.4082483 -0.4082483 -0.6324555 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.6324556 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -1.1547006 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.6324556 -0.4082483 ]
我可以做些什么来改进我的模型?
这是我获得上图的完整代码(包括注释):
# -*- coding: utf-8 -*-
# Main
import os
import simplejson as json
import random
import processors
import tokenizers
import analyzers
import clusterers
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm, preprocessing, cross_validation
from sklearn.metrics import precision_recall_curve, auc, classification_report, precision_recall_fscore_support
import collections
# Processor
from sklearn import preprocessing
# DBSCAN
from sklearn import cluster
from sklearn.preprocessing import StandardScaler
import numpy as np
class Processor(object):
    """Flatten page records into per-text-block feature rows.

    Every text block found under a page's ``'texts'`` list becomes one
    sample. ``extract`` returns, for each sample, a list of continuous
    values and a dict of discrete (string) attributes.
    """

    # Maps a feature name to a callable ``(page, datapoint) -> float``.
    CONTINUOUS_FEATURES = {
        'width': lambda page, datapoint: float(datapoint['bound']['width']),
    }

    def __init__(self, data):
        """Pair each text block with the page that contains it.

        Args:
            data: iterable of page dicts; each page must provide a
                ``'texts'`` list of text-block dicts.
        """
        self.data = data
        # pages[i] is the page that owns texts[i]; the two lists stay aligned.
        self.pages = []
        self.texts = []
        for page in self.data:
            for text in page['texts']:
                self.pages.append(page)
                self.texts.append(text)

    def extract(self):
        """Build the continuous and discrete features of every text block.

        Returns:
            A ``(continuous_features, discrete_features)`` tuple of two lists
            with one entry per text block. Each continuous entry is a list
            holding one value per ``CONTINUOUS_FEATURES`` callable; each
            discrete entry is a copy of the block's ``'computed'`` dict with
            an extra ``'path'`` key containing the block's path elements
            joined by ``' > '``.

        Raises:
            KeyError: if a text block lacks ``'computed'``, ``'path'`` or a
                key read by one of the continuous feature callables.
        """
        continuous_features = []
        discrete_features = []
        for page, text in zip(self.pages, self.texts):
            # ``values()`` exists on both Python 2 and 3 (``iteritems()`` is
            # Python 2 only) and the feature name was never used here.
            continuous_features.append(
                [process(page, text)
                 for process in self.CONTINUOUS_FEATURES.values()]
            )
            # Copy so that adding 'path' does not mutate the input record.
            discrete_feature = dict(text['computed'])
            discrete_feature['path'] = ' > '.join(text['path'])
            discrete_features.append(discrete_feature)
        return continuous_features, discrete_features
def load_data(file):
    """Read the JSON document stored at path ``file`` and return it parsed."""
    with open(file) as handle:
        return json.load(handle)
def main():
data = [{'body': {'scroll': {'top': 0, 'left': 0}, 'bound': {'width': 3983, 'top': 0, 'height': 1526, 'left': 0}}, 'texts': [{'computed': {'font-size': '15px', 'text-decoration-color': 'rgb(0, 0, 0)', 'color': 'rgb(0, 0, 0)', 'transform-origin': '15px 13px', 'margin-right': '10px', 'border-left-color': 'rgb(0, 0, 0)', 'background-repeat': 'no-repeat', 'caret-color': 'rgb(0, 0, 0)', 'border-top-color': 'rgb(0, 0, 0)', 'background-color': 'rgba(0, 0, 0, 0)', 'border-bottom-color': 'rgb(0, 0, 0)', 'outline-color': 'rgb(0, 0, 0)', 'border-right-color': 'rgb(0, 0, 0)', 'text-emphasis-color': 'rgb(0, 0, 0)', 'text-indent': '-9999px', 'unicode-bidi': 'normal', 'text-shadow': 'rgb(0, 0, 0) 0px 0px 0px', 'font-family': 'FuturaLight', 'background-image': 'url("file:///C:/Users/ronaldg/Documents/_Beauty/data/sites/adorebeauty/images/head/heart-icon.svg")', 'perspective-origin': '15px 13px', 'line-height': '20.25px', 'cursor': 'pointer', 'display': 'inline-block', 'column-rule-color': 'rgb(0, 0, 0)'}, 'text': ['Wishlist'], 'bound': {'width': 30, 'top': 30, 'height': 26, 'left': 2305.60009765625}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['mage-header'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'header', 'name': 'header'}, {'classes': ['header-section'], 'id': '', 'name': 'div'}, {'classes': ['header-right-block'], 'id': '', 'name': 'div'}, {'classes': ['header-account'], 'id': 'header-account', 'name': 'div'}, {'classes': ['header-wishlist'], 'id': '', 'name': 'a'}], 'html': 'Wishlist', 'path': ['div', 'div', 'div', 'header', 'div', 'div', 'div', 'a'], 'element': {'classes': ['header-wishlist'], 'id': '', 'name': 'a'}}, {'computed': {'font-size': '15px', 'perspective-origin': '72.7px 15px', 'transform-origin': '72.7px 15px', 'display': 'inline-block', 'padding-top': '5px', 'font-family': 'FuturaLight', 'line-height': '20.25px', 'background-color': 'rgba(0, 0, 0, 0)'}, 'text': ['Sign in', 
' | ', 'Register'], 'bound': {'width': 145.39999389648438, 'top': 25, 'height': 30, 'left': 2303.60009765625}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['mage-header'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'header', 'name': 'header'}, {'classes': ['header-section'], 'id': '', 'name': 'div'}, {'classes': ['header-right-block'], 'id': '', 'name': 'div'}, {'classes': ['header-account'], 'id': 'header-account', 'name': 'div'}], 'html': '\n <!-- -->\n <a href="https://www.adorebeauty.com.au/wishlist/" rel="nofollow" class="header-wishlist" style="border: 1px solid red;">Wishlist</a><a href="https://www.adorebeauty.com.au/customer/account/login/" rel="nofollow" class="login">Sign in</a> | <a href="https://www.adorebeauty.com.au/customer/account/create/" rel="nofollow">Register</a>', 'path': ['div', 'div', 'div', 'header', 'div', 'div', 'div'], 'element': {'classes': ['header-account'], 'id': 'header-account', 'name': 'div'}}, {'computed': {'border-top-style': 'solid', 'font-size': '14px', 'text-decoration-color': 'rgb(255, 255, 255)', 'color': 'rgb(255, 255, 255)', 'letter-spacing': '1px', 'transform-origin': '95.0833px 22.5px', 'padding-bottom': '12px', 'padding-top': '12px', 'border-top-width': '1px', 'border-left-color': 'rgba(0, 0, 0, 0)', 'border-right-style': 'solid', 'padding-right': '18px', 'border-left-style': 'solid', 'caret-color': 'rgb(255, 255, 255)', 'border-top-color': 'rgba(0, 0, 0, 0)', 'background-color': 'rgba(0, 0, 0, 0)', 'border-bottom-color': 'rgb(255, 255, 255)', 'outline-color': 'rgb(255, 255, 255)', 'border-right-color': 'rgba(0, 0, 0, 0)', 'text-emphasis-color': 'rgb(255, 255, 255)', 'unicode-bidi': 'normal', 'text-shadow': 'rgb(255, 255, 255) 0px 0px 0px', 'list-style-type': 'none', 'font-family': 'FuturaLight', 'text-align': 'left', 'perspective-origin': '95.0833px 22.5px', 'cursor': 'pointer', 'border-right-width': '1px', 'column-rule-color': 
'rgb(255, 255, 255)', 'text-transform': 'uppercase', 'line-height': '20px', 'border-left-width': '1px', 'padding-left': '18px'}, 'text': ['Shop By Category'], 'bound': {'width': 190.1666717529297, 'top': 80, 'height': 45, 'left': 1499}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['nav-head'], 'id': '', 'name': 'nav'}, {'classes': ['top-nav'], 'id': 'top-nav', 'name': 'ul'}, {'classes': ['cat-item', 'top'], 'id': '', 'name': 'li'}, {'classes': [], 'id': '', 'name': 'a'}], 'html': 'Shop By Category', 'path': ['div', 'div', 'nav', 'ul', 'li', 'a'], 'element': {'classes': [], 'id': '', 'name': 'a'}}, {'computed': {'font-size': '16px', 'text-decoration-color': 'rgb(20, 179, 88)', 'color': 'rgb(20, 179, 88)', 'transform-origin': '270px 25.5333px', 'padding-bottom': '10px', 'padding-top': '10px', 'border-left-color': 'rgb(20, 179, 88)', 'margin-bottom': '28px', 'padding-right': '10px', 'caret-color': 'rgb(20, 179, 88)', 'border-top-color': 'rgb(20, 179, 88)', 'background-color': 'rgb(234, 248, 248)', 'border-bottom-color': 'rgb(20, 179, 88)', 'outline-color': 'rgb(20, 179, 88)', 'border-right-color': 'rgb(20, 179, 88)', 'text-emphasis-color': 'rgb(20, 179, 88)', 'text-shadow': 'rgb(20, 179, 88) 0px 0px 0px', 'perspective-origin': '270px 25.5333px', 'margin-top': '22px', 'line-height': '21.6px', 'column-rule-color': 'rgb(20, 179, 88)', 'padding-left': '10px'}, 'text': [u'\u2714\ufe0e ', 'In Stock.', '\n We ship today if you order before ', '3 am'], 'bound': {'width': 540, 'top': 479.9666748046875, 'height': 51.05000305175781, 'left': 1921.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['col1-layout', 'main'], 'id': '', 'name': 'div'}, {'classes': ['col-main'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'div'}, {'classes': ['product-view'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 
'product_addtocart_form', 'name': 'form'}, {'classes': ['product-essential'], 'id': '', 'name': 'div'}, {'classes': ['product-shop'], 'id': 'product-shop', 'name': 'div'}, {'classes': ['add-to-box'], 'id': '', 'name': 'div'}, {'classes': ['is-before', 'new-in-stock'], 'id': '', 'name': 'div'}], 'html': u'\n <span><span class="tick">\u2714\ufe0e </span>In Stock.</span>\n We ship today if you order before <span class="time" data-time="1539262800000">3 am</span> ', 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'form', 'div', 'div', 'div', 'div'], 'element': {'classes': ['is-before', 'new-in-stock'], 'id': '', 'name': 'div'}}, {'computed': {'float': 'left', 'transform-origin': '135px 18.5833px', 'perspective-origin': '135px 18.5833px', 'background-color': 'rgba(0, 0, 0, 0)', 'text-align': 'left'}, 'text': ['Qty'], 'bound': {'width': 270, 'top': 561.0166625976562, 'height': 37.15000915527344, 'left': 1921.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['col1-layout', 'main'], 'id': '', 'name': 'div'}, {'classes': ['col-main'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'div'}, {'classes': ['product-view'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'product_addtocart_form', 'name': 'form'}, {'classes': ['product-essential'], 'id': '', 'name': 'div'}, {'classes': ['product-shop'], 'id': 'product-shop', 'name': 'div'}, {'classes': ['add-to-box'], 'id': '', 'name': 'div'}, {'classes': ['add-to-cart'], 'id': '', 'name': 'div'}], 'html': '\n\t<label for="qty">Qty</label>\n\t<select name="qty" id="qty" class="hasCustomSelect" style="-webkit-appearance: menulist-button; width: 60px; position: absolute; opacity: 0; height: 36px; font-size: 11px; left: 0px;">\n\t\t<option value="1" selected="">1</option>\n \t\t<option value="2">2</option>\n \t\t<option value="3">3</option>\n \t\t<option value="4">4</option>\n \t\t<option value="5">5</option>\n \t\t<option 
value="6">6</option>\n \t\t<option value="7">7</option>\n \t\t<option value="8">8</option>\n \t\t<option value="9">9</option>\n \t\t<option value="10">10</option>\n \t</select><span class="customSelect" style="display: inline-block;"><span class="customSelectInner" style="width: 49px; display: inline-block;">1</span></span>\n\t\t<button type="button" title="Add to Bag" class="button btn-cart"><span><span>Add to Bag</span></span></button>\n\t\t', 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'form', 'div', 'div', 'div', 'div'], 'element': {'classes': ['add-to-cart'], 'id': '', 'name': 'div'}}, {'computed': {'text-decoration-color': 'rgb(102, 102, 102)', 'outline-color': 'rgb(102, 102, 102)', 'border-left-color': 'rgb(102, 102, 102)', 'perspective-origin': '250px 35px', 'color': 'rgb(102, 102, 102)', 'border-right-color': 'rgb(102, 102, 102)', 'text-emphasis-color': 'rgb(102, 102, 102)', 'transform-origin': '250px 35px', 'text-shadow': 'rgb(102, 102, 102) 0px 0px 0px', 'background-color': 'rgba(0, 0, 0, 0)', 'caret-color': 'rgb(102, 102, 102)', 'border-top-color': 'rgb(102, 102, 102)', 'border-bottom-color': 'rgb(102, 102, 102)', 'line-height': '14px', 'column-rule-color': 'rgb(102, 102, 102)', 'text-align': 'left'}, 'text': [u"Skin is visibly restored by morning, as added\xa0Lavender Essential Oil works to soothe inflamed skin and promote an even skin tone,\xa0 Evening Primrose Oil helps to repair skin and Squalane replenishes skin's\xa0moisture barrier, leaving skin feeling soft, supple and moisturised.\xa0This restoring facial serum improves firmness and elasticity while encouraging a radiant, youthful complexion.\xa0"], 'bound': {'width': 500, 'top': 734.1666870117188, 'height': 70, 'left': 1937.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['col1-layout', 'main'], 'id': '', 'name': 'div'}, {'classes': ['col-main'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 
'div'}, {'classes': ['product-view'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'product_addtocart_form', 'name': 'form'}, {'classes': ['product-collateral'], 'id': '', 'name': 'div'}, {'classes': ['collateral-tabs', 'tab-list'], 'id': 'collateral-tabs', 'name': 'dl'}, {'classes': ['tab-container'], 'id': '', 'name': 'dd'}, {'classes': ['jspScrollable', 'tab-content'], 'id': '', 'name': 'div'}, {'classes': ['jspContainer'], 'id': '', 'name': 'div'}, {'classes': ['jspPane'], 'id': '', 'name': 'div'}, {'classes': ['jspContainer'], 'id': '', 'name': 'div'}, {'classes': ['jspPane'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'p'}], 'html': "Skin is visibly restored by morning, as added Lavender Essential Oil works to soothe inflamed skin and promote an even skin tone, Evening Primrose Oil helps to repair skin and Squalane replenishes skin's moisture barrier, leaving skin feeling soft, supple and moisturised. This restoring facial serum improves firmness and elasticity while encouraging a radiant, youthful complexion. 
<br><br>", 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'form', 'div', 'dl', 'dd', 'div', 'div', 'div', 'div', 'div', 'p'], 'element': {'classes': [], 'id': '', 'name': 'p'}}, {'computed': {'text-decoration-color': 'rgb(153, 153, 153)', 'outline-color': 'rgb(153, 153, 153)', 'line-height': '14px', 'vertical-align': 'top', 'perspective-origin': '79px 7px', 'color': 'rgb(153, 153, 153)', 'border-right-color': 'rgb(153, 153, 153)', 'text-emphasis-color': 'rgb(153, 153, 153)', 'transform-origin': '79px 7px', 'text-shadow': 'rgb(153, 153, 153) 0px 0px 0px', 'background-color': 'rgba(0, 0, 0, 0)', 'border-left-color': 'rgb(153, 153, 153)', 'caret-color': 'rgb(153, 153, 153)', 'list-style-type': 'none', 'border-bottom-color': 'rgb(153, 153, 153)', 'border-top-color': 'rgb(153, 153, 153)', 'column-rule-color': 'rgb(153, 153, 153)', 'text-align': 'left'}, 'text': ['Free over '], 'bound': {'width': 158, 'top': 1910.75, 'height': 14, 'left': 1995.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['footer-container'], 'id': '', 'name': 'div'}, {'classes': ['footer'], 'id': '', 'name': 'div'}, {'classes': ['footer-links-icons'], 'id': '', 'name': 'div'}, {'classes': ['footer-links'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'ul'}, {'classes': [], 'id': '', 'name': 'li'}], 'html': 'Free over ', 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'ul', 'li'], 'element': {'classes': [], 'id': '', 'name': 'li'}}]}]
# PROCESS DATA
# Flatten the page records into one continuous row and one discrete dict
# per text block.
processor = Processor(data)
raw_continuous_features, raw_discrete_features = processor.extract()
# ENCODE
continuous_features = np.array(raw_continuous_features)
# NOTE(review): scaled_continuous_features is computed but never read below;
# the hstack further down stacks the unscaled continuous_features instead.
scaled_continuous_features = preprocessing.scale(continuous_features)
# DictVectorizer turns each discrete dict into a row of a sparse matrix;
# .toarray() densifies it so it can be stacked with the continuous column.
DV = DictVectorizer()
discrete_features = DV.fit_transform(raw_discrete_features).toarray()
# Column 0 is the single continuous feature (width); every following column
# comes from the DictVectorizer.
features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
# CLUSTER DATA
# The scaler is fitted on every column, the DictVectorizer columns included.
# This also rebinds `data` from the raw page list to the scaled matrix.
data = StandardScaler().fit_transform(features)
db = cluster.DBSCAN(eps=0.5, min_samples=1).fit(data)
############################### DBSCAN PLOT DEMO/EXAMPLE ###############################
# NOTE(review): `metrics` is imported here but not referenced in this section.
from sklearn import metrics
# Boolean mask over the samples: True where DBSCAN reported a core sample.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# NOTE(review): n_noise_ is computed but not used in this section.
n_noise_ = list(labels).count(-1)
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
# One colour from the Spectral colormap per distinct label.
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
# Core samples of label k, drawn with large markers. Only columns 0 and 1 of
# the scaled matrix are plotted; all other feature columns are ignored.
xy = data[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
# Non-core samples of label k, drawn with small markers.
xy = data[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
main()
感谢任何帮助、提示或指点。
你实际上并没有进行聚类:簇的数量和数据点一样多,你得到的仍然只是原始数据……对只有 7 个样本的数据使用 DBSCAN 没有多大意义——其中没有任何“密集(dense)”的区域。
但你实际问的是关于 StandardScaler(标准缩放器)的问题。
如果你把分类属性编码为 0/1 二进制变量,然后再应用标准缩放器,那么 0 会变成某个负值,而 1 会变成某个正值(具体数值通常因列而异)。
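而在你的情况下,很多 one-hot 列中只有一个点取值为 1。对于 7 个样本,这样的一列均值为 1/7、标准差为 √6/7,因此缩放后 0 变为 −1/√6 ≈ −0.408,1 变为 √6 ≈ 2.449——这正是你在 scaled_data 中反复看到的两个数值,也是图中大多数点的 y 值都是 −0.408 的原因。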
这说明了为什么“one-hot 编码再做标准缩放”这一整套做法实际上是一个非常糟糕的 hack。将分类数据用于 DBSCAN 的正确方法是:A) 直接在这种数据上定义距离——无需把数据转换为向量;或者 B) 按照 Generalized DBSCAN 后续论文中讨论的方式定义合适的邻居谓词,以获得额外的控制。
这是我第一次尝试使用 DBSCAN,对我从网页中提取的文本内容块同时按连续特征(数据点的边界宽度)和离散特征(计算出的 css 以及数据点的路径)进行聚类。
我有 7 个样本(在第一个数据集中),所以当我将 DBSCAN min_samples 设置为 1 时,这个输出是我所期望的:
- 估计的簇数:7
- 预计噪声点数:0
- 同质性:1.000
- 完整性:1.000
然后我尝试把这些簇绘制出来以便可视化。绘图时我使用了 sklearn 的示例代码,并根据我的数据做了调整。但是,生成的图看起来不太对。
看起来大多数簇的y轴值都是相同的(-0.408)。我相信这取决于在这一步使用 StandardScaler():
feature_stack = np.hstack([continuous_features, discrete_features])
"""[[-1.31614507 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. ]
[-0.66130166 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. ]"""
features = feature_stack.astype(np.float32)
"""[[-1.3161451 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. ]
[-0.6613017 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. ]"""
# CLUSTER DATA
scaled_data = StandardScaler().fit_transform(features)
# scaled_data
# [[-1.3161452 -0.4082483 0.40824828 2.4494898 2.4494898 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 1.581139 1.581139 -0.4082483 1.1547004 -0.4082483 1.581139 -0.4082483 -0.4082483 -0.6324555 1.581139 -0.4082483 -0.4082483 -0.6324555 -0.4082483 2.4494898 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.6324556 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -1.1547006 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 1.581139 -0.4082483 ]
# [-0.66130173 -0.4082483 0.40824828 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.6324556 1.581139 -0.4082483 1.1547004 -0.4082483 1.581139 -0.4082483 -0.4082483 -0.6324555 1.581139 -0.4082483 -0.4082483 -0.6324555 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.6324556 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -1.1547006 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 -0.4082483 2.4494898 -0.4082483 -0.4082483 -0.6324556 -0.4082483 ]
我可以做些什么来改进我的模型?
这是我获得上图的完整代码(包括注释):
# -*- coding: utf-8 -*-
# Main
import os
import simplejson as json
import random
import processors
import tokenizers
import analyzers
import clusterers
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm, preprocessing, cross_validation
from sklearn.metrics import precision_recall_curve, auc, classification_report, precision_recall_fscore_support
import collections
# Processor
from sklearn import preprocessing
# DBSCAN
from sklearn import cluster
from sklearn.preprocessing import StandardScaler
import numpy as np
class Processor(object):
    """Flatten page records into per-text-block feature rows.

    Every text block found under a page's ``'texts'`` list becomes one
    sample. ``extract`` returns, for each sample, a list of continuous
    values and a dict of discrete (string) attributes.
    """

    # Maps a feature name to a callable ``(page, datapoint) -> float``.
    CONTINUOUS_FEATURES = {
        'width': lambda page, datapoint: float(datapoint['bound']['width']),
    }

    def __init__(self, data):
        """Pair each text block with the page that contains it.

        Args:
            data: iterable of page dicts; each page must provide a
                ``'texts'`` list of text-block dicts.
        """
        self.data = data
        # pages[i] is the page that owns texts[i]; the two lists stay aligned.
        self.pages = []
        self.texts = []
        for page in self.data:
            for text in page['texts']:
                self.pages.append(page)
                self.texts.append(text)

    def extract(self):
        """Build the continuous and discrete features of every text block.

        Returns:
            A ``(continuous_features, discrete_features)`` tuple of two lists
            with one entry per text block. Each continuous entry is a list
            holding one value per ``CONTINUOUS_FEATURES`` callable; each
            discrete entry is a copy of the block's ``'computed'`` dict with
            an extra ``'path'`` key containing the block's path elements
            joined by ``' > '``.

        Raises:
            KeyError: if a text block lacks ``'computed'``, ``'path'`` or a
                key read by one of the continuous feature callables.
        """
        continuous_features = []
        discrete_features = []
        for page, text in zip(self.pages, self.texts):
            # ``values()`` exists on both Python 2 and 3 (``iteritems()`` is
            # Python 2 only) and the feature name was never used here.
            continuous_features.append(
                [process(page, text)
                 for process in self.CONTINUOUS_FEATURES.values()]
            )
            # Copy so that adding 'path' does not mutate the input record.
            discrete_feature = dict(text['computed'])
            discrete_feature['path'] = ' > '.join(text['path'])
            discrete_features.append(discrete_feature)
        return continuous_features, discrete_features
def load_data(file):
    """Read the JSON document stored at path ``file`` and return it parsed."""
    with open(file) as handle:
        return json.load(handle)
def main():
data = [{'body': {'scroll': {'top': 0, 'left': 0}, 'bound': {'width': 3983, 'top': 0, 'height': 1526, 'left': 0}}, 'texts': [{'computed': {'font-size': '15px', 'text-decoration-color': 'rgb(0, 0, 0)', 'color': 'rgb(0, 0, 0)', 'transform-origin': '15px 13px', 'margin-right': '10px', 'border-left-color': 'rgb(0, 0, 0)', 'background-repeat': 'no-repeat', 'caret-color': 'rgb(0, 0, 0)', 'border-top-color': 'rgb(0, 0, 0)', 'background-color': 'rgba(0, 0, 0, 0)', 'border-bottom-color': 'rgb(0, 0, 0)', 'outline-color': 'rgb(0, 0, 0)', 'border-right-color': 'rgb(0, 0, 0)', 'text-emphasis-color': 'rgb(0, 0, 0)', 'text-indent': '-9999px', 'unicode-bidi': 'normal', 'text-shadow': 'rgb(0, 0, 0) 0px 0px 0px', 'font-family': 'FuturaLight', 'background-image': 'url("file:///C:/Users/ronaldg/Documents/_Beauty/data/sites/adorebeauty/images/head/heart-icon.svg")', 'perspective-origin': '15px 13px', 'line-height': '20.25px', 'cursor': 'pointer', 'display': 'inline-block', 'column-rule-color': 'rgb(0, 0, 0)'}, 'text': ['Wishlist'], 'bound': {'width': 30, 'top': 30, 'height': 26, 'left': 2305.60009765625}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['mage-header'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'header', 'name': 'header'}, {'classes': ['header-section'], 'id': '', 'name': 'div'}, {'classes': ['header-right-block'], 'id': '', 'name': 'div'}, {'classes': ['header-account'], 'id': 'header-account', 'name': 'div'}, {'classes': ['header-wishlist'], 'id': '', 'name': 'a'}], 'html': 'Wishlist', 'path': ['div', 'div', 'div', 'header', 'div', 'div', 'div', 'a'], 'element': {'classes': ['header-wishlist'], 'id': '', 'name': 'a'}}, {'computed': {'font-size': '15px', 'perspective-origin': '72.7px 15px', 'transform-origin': '72.7px 15px', 'display': 'inline-block', 'padding-top': '5px', 'font-family': 'FuturaLight', 'line-height': '20.25px', 'background-color': 'rgba(0, 0, 0, 0)'}, 'text': ['Sign in', 
' | ', 'Register'], 'bound': {'width': 145.39999389648438, 'top': 25, 'height': 30, 'left': 2303.60009765625}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['mage-header'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'header', 'name': 'header'}, {'classes': ['header-section'], 'id': '', 'name': 'div'}, {'classes': ['header-right-block'], 'id': '', 'name': 'div'}, {'classes': ['header-account'], 'id': 'header-account', 'name': 'div'}], 'html': '\n <!-- -->\n <a href="https://www.adorebeauty.com.au/wishlist/" rel="nofollow" class="header-wishlist" style="border: 1px solid red;">Wishlist</a><a href="https://www.adorebeauty.com.au/customer/account/login/" rel="nofollow" class="login">Sign in</a> | <a href="https://www.adorebeauty.com.au/customer/account/create/" rel="nofollow">Register</a>', 'path': ['div', 'div', 'div', 'header', 'div', 'div', 'div'], 'element': {'classes': ['header-account'], 'id': 'header-account', 'name': 'div'}}, {'computed': {'border-top-style': 'solid', 'font-size': '14px', 'text-decoration-color': 'rgb(255, 255, 255)', 'color': 'rgb(255, 255, 255)', 'letter-spacing': '1px', 'transform-origin': '95.0833px 22.5px', 'padding-bottom': '12px', 'padding-top': '12px', 'border-top-width': '1px', 'border-left-color': 'rgba(0, 0, 0, 0)', 'border-right-style': 'solid', 'padding-right': '18px', 'border-left-style': 'solid', 'caret-color': 'rgb(255, 255, 255)', 'border-top-color': 'rgba(0, 0, 0, 0)', 'background-color': 'rgba(0, 0, 0, 0)', 'border-bottom-color': 'rgb(255, 255, 255)', 'outline-color': 'rgb(255, 255, 255)', 'border-right-color': 'rgba(0, 0, 0, 0)', 'text-emphasis-color': 'rgb(255, 255, 255)', 'unicode-bidi': 'normal', 'text-shadow': 'rgb(255, 255, 255) 0px 0px 0px', 'list-style-type': 'none', 'font-family': 'FuturaLight', 'text-align': 'left', 'perspective-origin': '95.0833px 22.5px', 'cursor': 'pointer', 'border-right-width': '1px', 'column-rule-color': 
'rgb(255, 255, 255)', 'text-transform': 'uppercase', 'line-height': '20px', 'border-left-width': '1px', 'padding-left': '18px'}, 'text': ['Shop By Category'], 'bound': {'width': 190.1666717529297, 'top': 80, 'height': 45, 'left': 1499}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['nav-head'], 'id': '', 'name': 'nav'}, {'classes': ['top-nav'], 'id': 'top-nav', 'name': 'ul'}, {'classes': ['cat-item', 'top'], 'id': '', 'name': 'li'}, {'classes': [], 'id': '', 'name': 'a'}], 'html': 'Shop By Category', 'path': ['div', 'div', 'nav', 'ul', 'li', 'a'], 'element': {'classes': [], 'id': '', 'name': 'a'}}, {'computed': {'font-size': '16px', 'text-decoration-color': 'rgb(20, 179, 88)', 'color': 'rgb(20, 179, 88)', 'transform-origin': '270px 25.5333px', 'padding-bottom': '10px', 'padding-top': '10px', 'border-left-color': 'rgb(20, 179, 88)', 'margin-bottom': '28px', 'padding-right': '10px', 'caret-color': 'rgb(20, 179, 88)', 'border-top-color': 'rgb(20, 179, 88)', 'background-color': 'rgb(234, 248, 248)', 'border-bottom-color': 'rgb(20, 179, 88)', 'outline-color': 'rgb(20, 179, 88)', 'border-right-color': 'rgb(20, 179, 88)', 'text-emphasis-color': 'rgb(20, 179, 88)', 'text-shadow': 'rgb(20, 179, 88) 0px 0px 0px', 'perspective-origin': '270px 25.5333px', 'margin-top': '22px', 'line-height': '21.6px', 'column-rule-color': 'rgb(20, 179, 88)', 'padding-left': '10px'}, 'text': [u'\u2714\ufe0e ', 'In Stock.', '\n We ship today if you order before ', '3 am'], 'bound': {'width': 540, 'top': 479.9666748046875, 'height': 51.05000305175781, 'left': 1921.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['col1-layout', 'main'], 'id': '', 'name': 'div'}, {'classes': ['col-main'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'div'}, {'classes': ['product-view'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 
'product_addtocart_form', 'name': 'form'}, {'classes': ['product-essential'], 'id': '', 'name': 'div'}, {'classes': ['product-shop'], 'id': 'product-shop', 'name': 'div'}, {'classes': ['add-to-box'], 'id': '', 'name': 'div'}, {'classes': ['is-before', 'new-in-stock'], 'id': '', 'name': 'div'}], 'html': u'\n <span><span class="tick">\u2714\ufe0e </span>In Stock.</span>\n We ship today if you order before <span class="time" data-time="1539262800000">3 am</span> ', 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'form', 'div', 'div', 'div', 'div'], 'element': {'classes': ['is-before', 'new-in-stock'], 'id': '', 'name': 'div'}}, {'computed': {'float': 'left', 'transform-origin': '135px 18.5833px', 'perspective-origin': '135px 18.5833px', 'background-color': 'rgba(0, 0, 0, 0)', 'text-align': 'left'}, 'text': ['Qty'], 'bound': {'width': 270, 'top': 561.0166625976562, 'height': 37.15000915527344, 'left': 1921.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['col1-layout', 'main'], 'id': '', 'name': 'div'}, {'classes': ['col-main'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'div'}, {'classes': ['product-view'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'product_addtocart_form', 'name': 'form'}, {'classes': ['product-essential'], 'id': '', 'name': 'div'}, {'classes': ['product-shop'], 'id': 'product-shop', 'name': 'div'}, {'classes': ['add-to-box'], 'id': '', 'name': 'div'}, {'classes': ['add-to-cart'], 'id': '', 'name': 'div'}], 'html': '\n\t<label for="qty">Qty</label>\n\t<select name="qty" id="qty" class="hasCustomSelect" style="-webkit-appearance: menulist-button; width: 60px; position: absolute; opacity: 0; height: 36px; font-size: 11px; left: 0px;">\n\t\t<option value="1" selected="">1</option>\n \t\t<option value="2">2</option>\n \t\t<option value="3">3</option>\n \t\t<option value="4">4</option>\n \t\t<option value="5">5</option>\n \t\t<option 
value="6">6</option>\n \t\t<option value="7">7</option>\n \t\t<option value="8">8</option>\n \t\t<option value="9">9</option>\n \t\t<option value="10">10</option>\n \t</select><span class="customSelect" style="display: inline-block;"><span class="customSelectInner" style="width: 49px; display: inline-block;">1</span></span>\n\t\t<button type="button" title="Add to Bag" class="button btn-cart"><span><span>Add to Bag</span></span></button>\n\t\t', 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'form', 'div', 'div', 'div', 'div'], 'element': {'classes': ['add-to-cart'], 'id': '', 'name': 'div'}}, {'computed': {'text-decoration-color': 'rgb(102, 102, 102)', 'outline-color': 'rgb(102, 102, 102)', 'border-left-color': 'rgb(102, 102, 102)', 'perspective-origin': '250px 35px', 'color': 'rgb(102, 102, 102)', 'border-right-color': 'rgb(102, 102, 102)', 'text-emphasis-color': 'rgb(102, 102, 102)', 'transform-origin': '250px 35px', 'text-shadow': 'rgb(102, 102, 102) 0px 0px 0px', 'background-color': 'rgba(0, 0, 0, 0)', 'caret-color': 'rgb(102, 102, 102)', 'border-top-color': 'rgb(102, 102, 102)', 'border-bottom-color': 'rgb(102, 102, 102)', 'line-height': '14px', 'column-rule-color': 'rgb(102, 102, 102)', 'text-align': 'left'}, 'text': [u"Skin is visibly restored by morning, as added\xa0Lavender Essential Oil works to soothe inflamed skin and promote an even skin tone,\xa0 Evening Primrose Oil helps to repair skin and Squalane replenishes skin's\xa0moisture barrier, leaving skin feeling soft, supple and moisturised.\xa0This restoring facial serum improves firmness and elasticity while encouraging a radiant, youthful complexion.\xa0"], 'bound': {'width': 500, 'top': 734.1666870117188, 'height': 70, 'left': 1937.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['col1-layout', 'main'], 'id': '', 'name': 'div'}, {'classes': ['col-main'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 
'div'}, {'classes': ['product-view'], 'id': '', 'name': 'div'}, {'classes': [], 'id': 'product_addtocart_form', 'name': 'form'}, {'classes': ['product-collateral'], 'id': '', 'name': 'div'}, {'classes': ['collateral-tabs', 'tab-list'], 'id': 'collateral-tabs', 'name': 'dl'}, {'classes': ['tab-container'], 'id': '', 'name': 'dd'}, {'classes': ['jspScrollable', 'tab-content'], 'id': '', 'name': 'div'}, {'classes': ['jspContainer'], 'id': '', 'name': 'div'}, {'classes': ['jspPane'], 'id': '', 'name': 'div'}, {'classes': ['jspContainer'], 'id': '', 'name': 'div'}, {'classes': ['jspPane'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'p'}], 'html': "Skin is visibly restored by morning, as added Lavender Essential Oil works to soothe inflamed skin and promote an even skin tone, Evening Primrose Oil helps to repair skin and Squalane replenishes skin's moisture barrier, leaving skin feeling soft, supple and moisturised. This restoring facial serum improves firmness and elasticity while encouraging a radiant, youthful complexion. 
<br><br>", 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'form', 'div', 'dl', 'dd', 'div', 'div', 'div', 'div', 'div', 'p'], 'element': {'classes': [], 'id': '', 'name': 'p'}}, {'computed': {'text-decoration-color': 'rgb(153, 153, 153)', 'outline-color': 'rgb(153, 153, 153)', 'line-height': '14px', 'vertical-align': 'top', 'perspective-origin': '79px 7px', 'color': 'rgb(153, 153, 153)', 'border-right-color': 'rgb(153, 153, 153)', 'text-emphasis-color': 'rgb(153, 153, 153)', 'transform-origin': '79px 7px', 'text-shadow': 'rgb(153, 153, 153) 0px 0px 0px', 'background-color': 'rgba(0, 0, 0, 0)', 'border-left-color': 'rgb(153, 153, 153)', 'caret-color': 'rgb(153, 153, 153)', 'list-style-type': 'none', 'border-bottom-color': 'rgb(153, 153, 153)', 'border-top-color': 'rgb(153, 153, 153)', 'column-rule-color': 'rgb(153, 153, 153)', 'text-align': 'left'}, 'text': ['Free over '], 'bound': {'width': 158, 'top': 1910.75, 'height': 14, 'left': 1995.5}, 'selector': [{'classes': ['wrapper'], 'id': '', 'name': 'div'}, {'classes': ['page'], 'id': '', 'name': 'div'}, {'classes': ['footer-container'], 'id': '', 'name': 'div'}, {'classes': ['footer'], 'id': '', 'name': 'div'}, {'classes': ['footer-links-icons'], 'id': '', 'name': 'div'}, {'classes': ['footer-links'], 'id': '', 'name': 'div'}, {'classes': [], 'id': '', 'name': 'ul'}, {'classes': [], 'id': '', 'name': 'li'}], 'html': 'Free over ', 'path': ['div', 'div', 'div', 'div', 'div', 'div', 'ul', 'li'], 'element': {'classes': [], 'id': '', 'name': 'li'}}]}]
# PROCESS DATA: flatten the pages into per-text-block feature lists.
processor = Processor(data)
raw_continuous_features, raw_discrete_features = processor.extract()
# ENCODE: build one dense float32 matrix, continuous columns first.
continuous_features = np.array(raw_continuous_features)
# NOTE(review): scaled_continuous_features is computed here but never used;
# the hstack call below stacks the unscaled continuous_features instead.
scaled_continuous_features = preprocessing.scale(continuous_features)
# DictVectorizer turns the string-valued dict entries into 0/1 indicator
# columns; .toarray() converts its sparse output to a dense array.
DV = DictVectorizer()
discrete_features = DV.fit_transform(raw_discrete_features).toarray()
features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
# CLUSTER DATA
# StandardScaler is fitted on the whole matrix, so the 0/1 indicator columns
# are standardised together with the width column.
# NOTE(review): this rebinds `data` from the raw page list to the scaled
# feature matrix.
data = StandardScaler().fit_transform(features)
# With min_samples=1 every sample is a core sample, so no point can be
# labelled as noise.
db = cluster.DBSCAN(eps=0.5, min_samples=1).fit(data)
############################### DBSCAN PLOT DEMO/EXAMPLE ###############################
# NOTE(review): `metrics` is imported but not used in the code shown here.
from sklearn import metrics
# Boolean mask over all samples: True where the sample is a DBSCAN core sample.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring the noise label (-1) if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# NOTE(review): n_noise_ is computed but not used in the code shown here.
n_noise_ = list(labels).count(-1)
import matplotlib.pyplot as plt
# One Spectral colour per distinct label; black is substituted for noise below.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
# Black used for noise.
col = [0, 0, 0, 1]
# Samples that belong to cluster k.
class_member_mask = (labels == k)
# Only columns 0 and 1 of the scaled matrix are drawn: column 0 is the width
# feature and column 1 is the first DictVectorizer indicator column.
# Core samples of the cluster are drawn with large markers.
xy = data[class_member_mask & core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
# Non-core samples of the cluster are drawn with small markers.
xy = data[class_member_mask & ~core_samples_mask]
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
main()
非常感谢任何帮助、提示或指点。
你实际上并没有在做聚类。当簇的数量与数据点一样多时,你得到的只是原始数据本身……对于只有 7 个样本的数据,DBSCAN 没有多大意义——其中没有任何"密集(dense)"的区域。
但你实际的问题是关于 StandardScaler(标准缩放器)的。
如果你将分类属性编码为取值 0 或 1 的二元变量,然后应用标准缩放器,0 会变成某个负值,而 1 会变成某个(通常与之不同的)正值。
现在在你的情况下,只有一个点具有该特定值。
这说明了为什么"one-hot 编码加标准缩放"这一整套做法实际上是一个非常糟糕的 hack。将 DBSCAN 用于分类数据的正确方法是:A) 直接在这类数据上定义一个距离——无需先把数据转换为向量;或者 B) 按照后续论文 Generalized DBSCAN 中的讨论,定义合适的邻域谓词(neighbor predicate),以获得额外的控制。