From a3a3472231f88abc7483efcfe6d5c6318445a6b9 Mon Sep 17 00:00:00 2001 From: benjas <909336740@qq.com> Date: Tue, 16 Feb 2021 13:58:35 +0800 Subject: [PATCH] Add. Similarity calculation --- .../酒店推荐.ipynb | 152 +++++++++++++++--- 1 file changed, 131 insertions(+), 21 deletions(-) diff --git a/机器学习竞赛实战_优胜解决方案/基于相似度的酒店推荐系统/酒店推荐.ipynb b/机器学习竞赛实战_优胜解决方案/基于相似度的酒店推荐系统/酒店推荐.ipynb index 32fcdde..53950ff 100644 --- a/机器学习竞赛实战_优胜解决方案/基于相似度的酒店推荐系统/酒店推荐.ipynb +++ b/机器学习竞赛实战_优胜解决方案/基于相似度的酒店推荐系统/酒店推荐.ipynb @@ -2856,18 +2856,6 @@ "plt.show() # 绝大多数是250内的,不会是太长的" ] }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [], - "source": [ - "# 过滤掉不需要保留的\n", - "from nltk.corpus import stopwords\n", - "\n", - "set_stopwords = set(stopwords.words('english'))" - ] - }, { "cell_type": "code", "execution_count": 132, @@ -2882,7 +2870,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 153, "metadata": {}, "outputs": [ { @@ -2988,22 +2976,26 @@ "4 situated amid incredible shopping iconic attra... " ] }, - "execution_count": 149, + "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# 过滤掉不需要保留的\n", + "from nltk.corpus import stopwords\n", + "set_stopwords = set(stopwords.words('english'))\n", + "\n", "df['desc_clean'] = df['desc'].str.lower() # 全部转小写\n", "df['desc_clean'] = df['desc_clean'].apply(clean_txt)\n", - "df['desc_clean'] = df['desc_clean'].str.split(' ').apply(lambda x: ' '.join(k for k in x if k not in set_stopwords))\n", - "# df['desc_clean'] = df['desc_clean'].apply(clean_txt)\n", + "df['desc_clean'] = df['desc_clean'].str.split(' ').apply(lambda x: ' '.join(k for k in x if k not in set_stopwords)) # 去掉停用词\n", + "\n", "df.head()" ] }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 154, "metadata": {}, "outputs": [ { @@ -3012,18 +3004,18 @@ "\"Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. \\nThe neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay productive. Unwind in the bar, and enjoy American cuisine for breakfast, lunch and dinner in our restaurant. The 24-hour Pavilion Pantry? stocks a variety of snacks, drinks and sundries.\"" ] }, - "execution_count": 150, + "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df['desc'][0]" + "df['desc'][0] # 比较两者的差异" ] }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 155, "metadata": {}, "outputs": [ { @@ -3032,7 +3024,7 @@ "'located southern tip lake union hilton garden inn seattle downtown hotel perfectly located business leisure neighborhood home numerous major international companies including amazon google bill melinda gates foundation wealth eclectic restaurants bars make area seattle one sought locals visitors proximity lake union allows visitors take pacific northwest majestic scenery enjoy outdoor activities like kayaking sailing 2 000 sq ft versatile space complimentary business center state art v technology helpful staff guarantee conference cocktail reception wedding success refresh sparkling saltwater pool energize latest equipment 24 hour fitness center tastefully decorated flooded natural light guest rooms suites offer everything need relax stay productive unwind bar enjoy american cuisine breakfast lunch dinner restaurant 24 hour pavilion pantry stocks variety snacks drinks sundries '" ] }, - "execution_count": 151, + "execution_count": 155, "metadata": {}, "output_type": "execute_result" } @@ -3041,6 +3033,124 @@ "df['desc_clean'][0]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 相似度计算" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [], + "source": [ + "df.set_index('name', inplace=True) # 把name变成索引" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [], + "source": [ + "# 计算每个词的权重水平\n", + "tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english')\n", + "tfidf_matrix = tf.fit_transform(df['desc_clean']) # 转换当前数据" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(152, 26631)" + ] + }, + "execution_count": 161, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfidf_matrix.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(152, 152)" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "consine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix) # 计算相似度\n", + "consine_similarity.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1. , 0.01406466, 0.03391973, 0.00993816, 0.03246863,\n", + " 0.01501356, 0.02084233, 0.01581231, 0.00776991, 0.01999756,\n", + " 0.0182464 , 0.01231142, 0.01684817, 0.0119307 , 0.01085672,\n", + " 0.01791009, 0.0111671 , 0.04070581, 0.00971403, 0.02608081,\n", + " 0.03035044, 0.00885341, 0.01056546, 0.02009413, 0.01868132,\n", + " 0.02816165, 0.0321467 , 0.00681797, 0.02538754, 0.01969646,\n", + " 0.01638717, 0.04434173, 0.0167791 , 0.02169556, 0.03728075,\n", + " 0.03902235, 0.0069193 , 0.01352541, 0.04098731, 0.03227337,\n", + " 0.0172481 , 0.01166389, 0.01520804, 0.03544255, 0.04699436,\n", + " 0.01310661, 0.03274589, 0.0161937 , 0.03786155, 0.01421505,\n", + " 0.0266454 , 0.01830098, 0.03764235, 0.01329187, 0.02744756,\n", + " 0.01454037, 0.02460386, 0.03082779, 0.01229374, 0.02683908,\n", + " 0.03151467, 0.01008901, 0.04523004, 0.0312478 , 0.0323932 ,\n", + " 0.01846074, 0.03120115, 0.01118123, 0.02208553, 0.01201834,\n", + " 0.02355357, 0.01679123, 0.02597236, 0.02219805, 0.02335901,\n", + " 0.04484254, 0.00131829, 0.02258004, 0.01596417, 0.02875198,\n", + " 0.00728455, 0.01550146, 0.00586358, 0.00886017, 0.01505134,\n", + " 0.04805398, 0.01154452, 0.00439089, 0.00890586, 0.01341109,\n", + " 0.00761107, 0.00443603, 0.0146058 , 0.00493675, 0.01795282,\n", + " 0.01702045, 0.01116872, 0.02318485, 0.01508132, 0.02823554,\n", + " 0.01212307, 0.00548954, 0.00335406, 0.02440467, 0.00912747,\n", + " 0.02412254, 0.04179826, 0.02109056, 0.01228275, 0.03570519,\n", + " 0.05331295, 0.00886831, 0.0258668 , 0.01566466, 0.0267365 ,\n", + " 0.07529637, 0.01660016, 0.0371029 , 0.0114389 , 0.01876546,\n", + " 0.00671789, 0.01194306, 0.01871489, 0.00346884, 0.00876216,\n", + " 0.00946862, 0.04517183, 0.07370297, 0.00884079, 0.01411685,\n", + " 0.01406232, 0.0124469 , 0.02123197, 0.01859324, 0.02939583,\n", + " 0.00481356, 0.0358775 , 0.01307147, 0.0136874 , 0.01567845,\n", + " 0.01888209, 0.02270796, 0.02684905, 0.01715449, 0.00317041,\n", + " 0.00237712, 0.0237994 , 0.00739057, 0.00643772, 0.01595671,\n", + " 0.00239758, 0.00730286])" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "consine_similarity[0] # 第0个与全部矩阵内容的相似度计算" + ] + }, { "cell_type": "code", "execution_count": null,