机器学习---knn之价格预测
生活随笔
收集整理的這篇文章主要介紹了
机器学习---knn之价格预测
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
# -*- coding: UTF-8 -*-
import math
from random import randint, random
def wineprice(rating, age):
    """Model the price of a wine from its quality rating and its age.

    The price climbs toward a peak at ``peak_age`` (higher ratings peak
    later) and then decays, reaching 0 within 5 years past the peak.

    NOTE(review): a rating of exactly 50 makes peak_age 0 and the
    else-branch divides by zero; the data generators draw ratings from
    [50, 100), so this is only hit in a degenerate case -- confirm
    before reusing with other inputs.
    """
    peak_age = rating - 50
    base = rating / 2  # higher-rated wines start out more expensive
    if age > peak_age:
        # Past the peak year: quality (and price) drops off over 5 years.
        price = base * (5 - (age - peak_age))
    else:
        # Approaching the peak: price rises up to 5x the base value.
        price = base * (5 * ((age + 1) / peak_age))
    # Never quote a negative price.
    return max(price, 0)
def wineset1():
    """Generate 300 synthetic (rating, age) -> price training rows.

    Ratings are uniform in [50, 100), ages uniform in [0, 50); the
    reference price from wineprice() is perturbed by multiplicative
    noise in [0.8, 1.2).
    """
    rows = []
    for _ in range(300):
        # Random rating and age (same draw order as before).
        rating = random() * 50 + 50
        age = random() * 50
        # Reference price, then noise.
        noisy_price = wineprice(rating, age) * (random() * 0.4 + 0.8)
        rows.append({'input': (rating, age), 'result': noisy_price})
    return rows
# Similarity between two vectors is measured as Euclidean distance.
def euclidean(v1, v2):
    """Return the Euclidean distance between vectors v1 and v2."""
    # Index over len(v1) (rather than zip) to keep the original
    # contract: v2 shorter than v1 is an error.
    squared = sum(pow(v1[i] - v2[i], 2) for i in range(len(v1)))
    return math.sqrt(squared)
# 獲取要預(yù)測的向量vec1與數(shù)據(jù)集data中所有元素的距離 def getdistances(data,vec1): distancelist = [] for i in range(len(data)): vec2 = data[i]['input'] distancelist.append((euclidean(vec1, vec2),i)) distancelist.sort() return distancelist
# kNN estimate: the plain average of the k nearest neighbours' prices.
def knnestimate(data, vec1, k=5):
    """Predict a value for vec1 as the mean result of its k nearest rows."""
    neighbours = getdistances(data, vec1)
    total = 0.0
    # Average the results of the k closest rows.
    for i in range(k):
        total += data[neighbours[i][1]]['result']
    return total / k
# Inverse function: convert a distance into a weight.
def inverseweight(dist, num=1.0, const=0.1):
    """Weight decaying as num/(dist+const); const avoids division by zero."""
    return num / (const + dist)
# Subtraction function: weight falls off linearly, hitting 0 at const.
def subtractweight(dist, const=1.0):
    """Return const - dist for dist <= const, otherwise 0."""
    return const - dist if dist <= const else 0

# Gaussian function: smooth bell-shaped decay that never reaches 0.
def gaussian(dist, sigma=10.0):
    """Bell-curve weight e**(-dist^2/sigma^2); equals 1 at dist == 0."""
    return math.e ** (-dist ** 2 / sigma ** 2)
# 加權(quán)KNN算法,根據(jù)距離對K個近鄰加權(quán),權(quán)值乘以對應(yīng)的價(jià)格作累加最后除以權(quán)值之和 # 參數(shù)weightf是函數(shù),指示使用哪一種權(quán)值衰減方式 # 試驗(yàn)得出,k=3時(shí) 誤差最小 def weightedKnn(data, vec1, k=3, weightf = gaussian): dlist = getdistances(data, vec1) result = 0.0 weight = 0.0 for i in range(k): price = data[dlist[i][1]]['result'] # 價(jià)格 result += price * weightf(dlist[i][0]) # 距離加權(quán),累加價(jià)格和 weight += weightf(dlist[i][0])? ? ? ? ?# 統(tǒng)計(jì)權(quán)值和 return result / weight #交叉驗(yàn)證 # 1 隨機(jī)劃分?jǐn)?shù)據(jù)集,test指定了測試集所占的比例 # 典型的情況下,測試集只會包含一小部分?jǐn)?shù)據(jù),大概是所有數(shù)據(jù)的5%,剩下的95%都是訓(xùn)練集 def dividedata(data, test=0.05): trainset = [] testset = [] for row in data: if random() < test: testset.append(row) else: trainset.append(row) return trainset,testset??
# 2 對測試集進(jìn)行預(yù)測算出誤差,針對測試集中的每一項(xiàng)內(nèi)容調(diào)用算法,返回誤差 #? 其中參數(shù)algf是一個函數(shù),可以是 knnestimate,weightedknn或者其他計(jì)算價(jià)格的函數(shù) def testalgorithm(algf, trainset, testset): error = 0.0 for row in testset: # 對測試集的每一項(xiàng)數(shù)據(jù)都進(jìn)行預(yù)測 guess = algf(trainset, row['input']) # 對預(yù)測結(jié)果與正確結(jié)果進(jìn)行做差,得出誤差 error += (row['result'] - guess) **2? return error / len(testset)? ? ?
# 3 交叉驗(yàn)證 多次調(diào)用dividedata函數(shù)對數(shù)據(jù)進(jìn)行隨機(jī)劃分,并計(jì)算誤差,取所有隨機(jī)劃分的均值 def crossvalidate(algf, data, trials = 100, test = 0.05): error = 0.0 #trials 代表隨機(jī)劃分的次數(shù) for i in range(trials): trainset,testset = dividedata(data, test) #100多次的交叉驗(yàn)證之后,對累計(jì)的誤差求平均值 error += testalgorithm(algf, trainset, testset) return error / trials
# Regenerate the data set with extra variables: supermarket aisle
# (pure noise) and bottle size (scales the price).
def wineset2():
    """Generate 300 rows with (rating, age, aisle, bottlesize) inputs."""
    bottle_sizes = (375.0, 750.0, 1500.0, 3000.0)
    rows = []
    for _ in range(300):
        # Same random draw order as before, so seeded runs agree.
        rating = random() * 50 + 50
        age = random() * 50
        aisle = float(randint(1, 20))            # irrelevant feature
        bottlesize = bottle_sizes[randint(0, 3)]
        price = wineprice(rating, age)
        price *= bottlesize / 750                # bigger bottle, higher price
        price *= random() * 0.9 + 0.2            # heavy noise
        rows.append({'input': (rating, age, aisle, bottlesize),
                     'result': price})
    return rows
# Rescale features: scale[i] multiplies feature i of every row, so the
# scale vector must be as long as each row's input.
def rescale(data, scale):
    """Return a new data set whose inputs are elementwise-scaled copies."""
    return [
        {'input': [scale[i] * row['input'][i] for i in range(len(scale))],
         'result': row['result']}
        for row in data
    ]
# 構(gòu)造 優(yōu)化搜索算法 的代價(jià)函數(shù) def createcostfunction(algf, data): def costf(scale): sdata = rescale(data, scale) return crossvalidate(algf, data) return costf
if __name__ == '__main__':
    # Build the synthetic data set.
    data = wineset1()
    # Single-argument print(...) calls behave identically on Python 2
    # and 3; the original Python-2-only print statements did not parse
    # under Python 3.
    print(data)
    print(getdistances(data, (99.0, 5.0)))
    # Price prediction with plain kNN.
    print('----------------------------------k-最近鄰算法---------------------------------------')
    print(knnestimate(data, (99.0, 5.0)))
    print('----------------------------------加權 k-最近鄰算法---------------------------------------')
    # Distance-weighted refinement of knnestimate.
    print(weightedKnn(data, (99.0, 5.0)))
    # Cross-validation of the weighted estimator.
    print('----------------------------------交叉驗證,誤差均值---------------------------------------')
    print(crossvalidate(weightedKnn, data))
from random import random ,randint import math
def wineprice(rating, age):
    """Model the price of a wine from its quality rating and its age.

    The price climbs toward a peak at ``peak_age`` (higher ratings peak
    later) and then decays, reaching 0 within 5 years past the peak.

    NOTE(review): a rating of exactly 50 makes peak_age 0 and the
    else-branch divides by zero; the data generators draw ratings from
    [50, 100), so this is only hit in a degenerate case -- confirm
    before reusing with other inputs.
    """
    peak_age = rating - 50
    base = rating / 2  # higher-rated wines start out more expensive
    if age > peak_age:
        # Past the peak year: quality (and price) drops off over 5 years.
        price = base * (5 - (age - peak_age))
    else:
        # Approaching the peak: price rises up to 5x the base value.
        price = base * (5 * ((age + 1) / peak_age))
    # Never quote a negative price.
    return max(price, 0)
def wineset1():
    """Generate 300 synthetic (rating, age) -> price training rows.

    Ratings are uniform in [50, 100), ages uniform in [0, 50); the
    reference price from wineprice() is perturbed by multiplicative
    noise in [0.8, 1.2).
    """
    rows = []
    for _ in range(300):
        # Random rating and age (same draw order as before).
        rating = random() * 50 + 50
        age = random() * 50
        # Reference price, then noise.
        noisy_price = wineprice(rating, age) * (random() * 0.4 + 0.8)
        rows.append({'input': (rating, age), 'result': noisy_price})
    return rows
# Similarity between two vectors is measured as Euclidean distance.
def euclidean(v1, v2):
    """Return the Euclidean distance between vectors v1 and v2."""
    # Index over len(v1) (rather than zip) to keep the original
    # contract: v2 shorter than v1 is an error.
    squared = sum(pow(v1[i] - v2[i], 2) for i in range(len(v1)))
    return math.sqrt(squared)
# 獲取要預(yù)測的向量vec1與數(shù)據(jù)集data中所有元素的距離 def getdistances(data,vec1): distancelist = [] for i in range(len(data)): vec2 = data[i]['input'] distancelist.append((euclidean(vec1, vec2),i)) distancelist.sort() return distancelist
# kNN estimate: the plain average of the k nearest neighbours' prices.
def knnestimate(data, vec1, k=5):
    """Predict a value for vec1 as the mean result of its k nearest rows."""
    neighbours = getdistances(data, vec1)
    total = 0.0
    # Average the results of the k closest rows.
    for i in range(k):
        total += data[neighbours[i][1]]['result']
    return total / k
# Inverse function: convert a distance into a weight.
def inverseweight(dist, num=1.0, const=0.1):
    """Weight decaying as num/(dist+const); const avoids division by zero."""
    return num / (const + dist)
# Subtraction function: weight falls off linearly, hitting 0 at const.
def subtractweight(dist, const=1.0):
    """Return const - dist for dist <= const, otherwise 0."""
    return const - dist if dist <= const else 0

# Gaussian function: smooth bell-shaped decay that never reaches 0.
def gaussian(dist, sigma=10.0):
    """Bell-curve weight e**(-dist^2/sigma^2); equals 1 at dist == 0."""
    return math.e ** (-dist ** 2 / sigma ** 2)
# 加權(quán)KNN算法,根據(jù)距離對K個近鄰加權(quán),權(quán)值乘以對應(yīng)的價(jià)格作累加最后除以權(quán)值之和 # 參數(shù)weightf是函數(shù),指示使用哪一種權(quán)值衰減方式 # 試驗(yàn)得出,k=3時(shí) 誤差最小 def weightedKnn(data, vec1, k=3, weightf = gaussian): dlist = getdistances(data, vec1) result = 0.0 weight = 0.0 for i in range(k): price = data[dlist[i][1]]['result'] # 價(jià)格 result += price * weightf(dlist[i][0]) # 距離加權(quán),累加價(jià)格和 weight += weightf(dlist[i][0])? ? ? ? ?# 統(tǒng)計(jì)權(quán)值和 return result / weight #交叉驗(yàn)證 # 1 隨機(jī)劃分?jǐn)?shù)據(jù)集,test指定了測試集所占的比例 # 典型的情況下,測試集只會包含一小部分?jǐn)?shù)據(jù),大概是所有數(shù)據(jù)的5%,剩下的95%都是訓(xùn)練集 def dividedata(data, test=0.05): trainset = [] testset = [] for row in data: if random() < test: testset.append(row) else: trainset.append(row) return trainset,testset??
# 2 對測試集進(jìn)行預(yù)測算出誤差,針對測試集中的每一項(xiàng)內(nèi)容調(diào)用算法,返回誤差 #? 其中參數(shù)algf是一個函數(shù),可以是 knnestimate,weightedknn或者其他計(jì)算價(jià)格的函數(shù) def testalgorithm(algf, trainset, testset): error = 0.0 for row in testset: # 對測試集的每一項(xiàng)數(shù)據(jù)都進(jìn)行預(yù)測 guess = algf(trainset, row['input']) # 對預(yù)測結(jié)果與正確結(jié)果進(jìn)行做差,得出誤差 error += (row['result'] - guess) **2? return error / len(testset)? ? ?
# 3 交叉驗(yàn)證 多次調(diào)用dividedata函數(shù)對數(shù)據(jù)進(jìn)行隨機(jī)劃分,并計(jì)算誤差,取所有隨機(jī)劃分的均值 def crossvalidate(algf, data, trials = 100, test = 0.05): error = 0.0 #trials 代表隨機(jī)劃分的次數(shù) for i in range(trials): trainset,testset = dividedata(data, test) #100多次的交叉驗(yàn)證之后,對累計(jì)的誤差求平均值 error += testalgorithm(algf, trainset, testset) return error / trials
#重新生成數(shù)據(jù)集,加入干擾變量 def wineset2(): rows = [] for i in range(300): rating = random() * 50 + 50 age = random() * 50 aisle = float(randint(1,20)) bottleszie = [375.0, 750.0, 1500.0, 3000.0][randint(0, 3)] price = wineprice(rating, age) price *= (bottleszie / 750) price*=(random()*0.9+0.2) rows.append({'input': (rating, age, aisle, bottleszie), 'result': price}) return rows
# Rescale features: scale[i] multiplies feature i of every row, so the
# scale vector must be as long as each row's input.
def rescale(data, scale):
    """Return a new data set whose inputs are elementwise-scaled copies."""
    return [
        {'input': [scale[i] * row['input'][i] for i in range(len(scale))],
         'result': row['result']}
        for row in data
    ]
# 構(gòu)造 優(yōu)化搜索算法 的代價(jià)函數(shù) def createcostfunction(algf, data): def costf(scale): sdata = rescale(data, scale) return crossvalidate(algf, data) return costf
if __name__ == '__main__':
    # Build the synthetic data set.
    data = wineset1()
    # Single-argument print(...) calls behave identically on Python 2
    # and 3; the original Python-2-only print statements did not parse
    # under Python 3.
    print(data)
    print(getdistances(data, (99.0, 5.0)))
    # Price prediction with plain kNN.
    print('----------------------------------k-最近鄰算法---------------------------------------')
    print(knnestimate(data, (99.0, 5.0)))
    print('----------------------------------加權 k-最近鄰算法---------------------------------------')
    # Distance-weighted refinement of knnestimate.
    print(weightedKnn(data, (99.0, 5.0)))
    # Cross-validation of the weighted estimator.
    print('----------------------------------交叉驗證,誤差均值---------------------------------------')
    print(crossvalidate(weightedKnn, data))
總結(jié)
以上是生活随笔為你收集整理的机器学习---knn之价格预测的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 顺丰速运怎么这么慢(顺丰快递单号查询)
- 下一篇: istio-0.8 服务超时配置