# Seed level 1 of the search: every row of num_favorable_by_movie whose
# "favorable" value is greater than 50 becomes a one-element frozenset key
# (built from the row's index label) mapped to that "favorable" value.
frequent_itemsets[1] = {
    frozenset((movie,)): stats["favorable"]
    for movie, stats in num_favorable_by_movie.iterrows()
    if stats["favorable"] > 50
}
无论是列表推导式还是字典推导式,在这次复现中都会大量出现。
接下来我们创建下面这个函数,用来由 (k-1) 项的频繁项集生成 k 项的频繁项集:
1 2 3 4 5 6 7 8 9 10 11
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    """Grow every (k-1)-itemset by one movie and keep the frequent supersets.

    Args:
        favorable_reviews_by_users: Mapping of user -> frozenset of movies
            that user reviewed favorably.
        k_1_itemsets: Iterable of frozensets (e.g. a dict keyed by them)
            holding the itemsets found at the previous level.
        min_support: Minimum count a superset must reach to be returned.

    Returns:
        A dict mapping each superset (frozenset) to its count, restricted to
        supersets whose count is at least ``min_support``.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        for itemset in k_1_itemsets:
            # Only itemsets fully contained in this user's reviews can be
            # extended with another movie from the same user.
            if not itemset.issubset(reviews):
                continue
            for other_reviewed_movie in reviews - itemset:
                current_superset = itemset | frozenset((other_reviewed_movie,))
                # NOTE: the same superset is incremented once for every
                # member of k_1_itemsets it can be grown from, so one user
                # may add more than 1 to a single superset's count.
                counts[current_superset] += 1
    return {
        itemset: frequency
        for itemset, frequency in counts.items()
        if frequency >= min_support
    }
最后在 return 语句中按 min_support 过滤各候选项集的出现次数,只返回其中的频繁项集。
然后用一个简单的 for 循环,让 k 从 2 递增到 19,逐层生成长度为 k 的频繁项集:
1 2 3 4 5
# Build the itemsets level by level for k = 2..19: each level is grown from
# the itemsets stored for level k - 1 and saved under key k.
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users,
        frequent_itemsets[k - 1],
        min_support=min_support,
    )
    frequent_itemsets[k] = cur_frequent_itemsets
# Turn every stored itemset into candidate rules: each member in turn is the
# conclusion, and the remaining members form the premise.
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts:
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
# Test split: every rating whose userId is NOT in range(200).
test_dataset = all_ratings[~all_ratings['userId'].isin(range(200))]
# Keep only the rows selected by the "favorable" column
# (presumably a boolean mask -- confirm where the column is created).
test_favorable = test_dataset[test_dataset["favorable"]]
# Map each userId to the frozenset of movieId values in that user's rows.
test_favorable_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favorable.groupby("userId")["movieId"]
}
# Score every candidate rule against each test user: a rule is only judged
# for users whose reviews contain the whole premise, and it counts as
# correct when the conclusion is also among that user's reviews.
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
# Confidence of each rule in rule_confidence on the test data:
# correct / (correct + incorrect).
# NOTE(review): if both counts are 0 for a rule the division raises
# ZeroDivisionError -- confirm every rule's premise matched at least once.
test_confidence = {
    rule: correct_counts[rule]
    / float(correct_counts[rule] + incorrect_counts[rule])
    for rule in rule_confidence
}