!pip install awscli tqdm mlxtend

import numpy as np
import pandas as pd
import csv
import os
from tqdm import tqdm
import glob

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import gc

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

%matplotlib inline

np.random.seed(42)

categories = [

# US REVIEWS DATASET:
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Wireless_v1_00.tsv.gz',                      # size: 1.59 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Watches_v1_00.tsv.gz',                       # size: 0.15 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz',                   # size: 0.44 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Video_DVD_v1_00.tsv.gz',                     # size: 1.41 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Video_v1_00.tsv.gz',                         # size: 0.13 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz',                          # size: 0.78 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Tools_v1_00.tsv.gz',                         # size: 0.31 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Sports_v1_00.tsv.gz',                        # size: 0.81 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Software_v1_00.tsv.gz',                      # size: 0.09 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Shoes_v1_00.tsv.gz',                         # size: 0.60 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Pet_Products_v1_00.tsv.gz',                  # size: 0.48 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv.gz',      # size: 0.02 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_PC_v1_00.tsv.gz',                            # size: 1.41 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Outdoors_v1_00.tsv.gz',                      # size: 0.42 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz',               # size: 0.48 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz',           # size: 0.18 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz',                         # size: 1.42 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz',            # size: 0.02 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Apps_v1_00.tsv.gz',                   # size: 0.52 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Major_Appliances_v1_00.tsv.gz',              # size: 0.02 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Luggage_v1_00.tsv.gz',                       # size: 0.06 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Lawn_and_Garden_v1_00.tsv.gz',               # size: 0.45 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz',                       # size: 0.87 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz',                       # size: 0.23 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz',              # size: 0.47 GB.       
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Home_Entertainment_v1_00.tsv.gz',            # size: 0.18 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Home_v1_00.tsv.gz',                          # size: 1.01 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Health_Personal_Care_v1_00.tsv.gz',          # size: 0.94 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Grocery_v1_00.tsv.gz',                       # size: 0.37 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Gift_Card_v1_00.tsv.gz',                     # size: 0.01 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Furniture_v1_00.tsv.gz',                     # size: 0.14 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz',                   # size: 0.65 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz',           # size: 0.03 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz',        # size: 0.47 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',              # size: 0.02 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Music_Purchase_v1_00.tsv.gz',        # size: 0.24 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz',        # size: 1.21 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_00.tsv.gz',        # size: 2.51 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz',                        # size: 0.41 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz',                         # size: 1.24 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_01.tsv.gz',                         # size: 2.51 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_00.tsv.gz',                         # size: 2.55 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz',                        # size: 0.85 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Baby_v1_00.tsv.gz',                          # size: 0.33 GB.
# 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Automotive_v1_00.tsv.gz',                    # size: 0.54 GB.
  's3://amazon-reviews-pds/tsv/amazon_reviews_us_Apparel_v1_00.tsv.gz',                       # size: 0.60 GB.

# MULTILINGUAL REVIEWS DATASET:
#'s3://amazon-reviews-pds/tsv/amazon_reviews_multilingual_US_v1_00.tsv.gz',                  # size: 1.37 GB.
#'s3://amazon-reviews-pds/tsv/amazon_reviews_multilingual_UK_v1_00.tsv.gz',                  # size: 0.33 GB.
#'s3://amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz',                  # size: 0.09 GB.
#'s3://amazon-reviews-pds/tsv/amazon_reviews_multilingual_FR_v1_00.tsv.gz',                  # size: 0.07 GB.
#'s3://amazon-reviews-pds/tsv/amazon_reviews_multilingual_DE_v1_00.tsv.gz'                   # size: 0.23 GB.
]


def download_everything(categories):

  # Aprox: ~ 30.15 GB for US.
  #        ~  2.07 GB for Multiligual.
  for category in tqdm(categories, ncols=80):

    # file name: 'us_category_v1_xx.tsv.gz'
    name = category.split('/')[-1].split('_')[2:]
    name = '_'.join(name)

    # Skip if exists, eiste giannides eimai sigouros tha kanete malakia
    if os.path.isfile(f'data/{name}'):
      continue
    else:
      # download into 'data/'
      os.system(f'aws s3 cp {category} data/{name} --no-sign-request')


def request_dataframe(file, usecols=np.arange(15), keep_verified=False, clean=True):

  # DATA COLUMNS:
  # 0:  marketplace       - 2 letter country code of the marketplace where the review was written.
  # 1:  customer_id       - Random identifier that can be used to aggregate reviews written by a single author.
  # 2:  review_id         - The unique ID of the review.
  # 3:  product_id        - The unique Product ID the review pertains to. In the multilingual dataset the reviews
  #                         for the same product in different countries can be grouped by the same product_id.
  # 4:  product_parent    - Random identifier that can be used to aggregate reviews for the same product.
  # 5:  product_title     - Title of the product.
  # 6:  product_category  - Broad product category that can be used to group reviews 
  #                         (also used to group the dataset into coherent parts).
  # 7:  star_rating       - The 1-5 star rating of the review.
  # 8:  helpful_votes     - Number of helpful votes.
  # 9:  total_votes       - Number of total votes the review received.
  # 10: vine              - Review was written as part of the Vine program.
  # 11: verified_purchase - The review is on a verified purchase.
  # 12: review_headline   - The title of the review.
  # 13: review_body       - The review text.
  # 14: review_date       - The date the review was written.

  names = [
         'marketplace', 'customer_id', 'review_id', 'product_id',
         'product_parent', 'product_title', 'product_category', 'star_rating',
         'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
         'review_headline', 'review_body', 'review_date'
  ]

  # dtypes ranges
  # int8:     (-128 to 127)
  # int16:    (-32768 to 32767)
  # int32:    (-2147483648 to 2147483647)
  # uint8:    (0 to 255)
  # uint16:   (0 to 65535)
  # uint32:   ((0 to 4294967295)
  # 'S', 'a':	(byte-)string
  # 'U':      (unicode)

  dtypes = {
      'marketplace': str, 'customer_id': np.uint32, 'review_id': str,
      'product_id': str, 'product_parent': np.uint32, 'product_title': str,
      'product_category': str, 'star_rating': np.uint8, 'helpful_votes': np.uint16,
      'total_votes': np.uint16, 'vine': str, 'verified_purchase': str,
      'review_headline': str, 'review_body': str, 'review_date': str
    }

  # Load Data
  df = pd.read_csv(file, compression='gzip', names=names, dtype=dtypes,
                   sep='\t', header=0, usecols=np.arange(15), quoting=csv.QUOTE_NONE,
                   error_bad_lines=False, skip_blank_lines=True,
                   parse_dates=['review_date'], infer_datetime_format=True)

  if clean:
    # Drop row if any of columns is NaN
    df.dropna(inplace=True)

  # Keep only verified
  if keep_verified:
    df = df[df.verified_purchase == 'Y']

  # Always reset indices after removing rows
  df.reset_index(drop=True, inplace=True)
  df = df.iloc[:, usecols]

  return df


def create_df(usecols=np.arange(15), clean=True):
  total_df = []
  files = [file for file in glob.glob(f'data/*')]

  for file in tqdm(files, ncols=80):
    df = request_dataframe(file=file, usecols=usecols, clean=clean) 

    # Append to main Df 
    total_df.append(df)
  
  total_df = pd.concat(total_df)
  
  return total_df

Requirement already satisfied: awscli in /usr/local/lib/python3.6/dist-packages (1.18.16)
Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (4.28.1)
Requirement already satisfied: mlxtend in /usr/local/lib/python3.6/dist-packages (0.14.0)
Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from awscli) (0.15.2)
Requirement already satisfied: botocore==1.15.16 in /usr/local/lib/python3.6/dist-packages (from awscli) (1.15.16)
Requirement already satisfied: PyYAML<5.4,>=3.10; python_version != "3.4" in /usr/local/lib/python3.6/dist-packages (from awscli) (3.13)
Requirement already satisfied: colorama<0.4.4,>=0.2.5; python_version != "3.4" in /usr/local/lib/python3.6/dist-packages (from awscli) (0.4.3)
Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from awscli) (0.3.3)
Requirement already satisfied: rsa<=3.5.0,>=3.1.2 in /usr/local/lib/python3.6/dist-packages (from awscli) (3.4.2)
Requirement already satisfied: pandas>=0.17.1 in /usr/local/lib/python3.6/dist-packages (from mlxtend) (0.25.3)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from mlxtend) (45.2.0)
Requirement already satisfied: scipy>=0.17 in /usr/local/lib/python3.6/dist-packages (from mlxtend) (1.4.1)
Requirement already satisfied: numpy>=1.10.4 in /usr/local/lib/python3.6/dist-packages (from mlxtend) (1.17.5)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.6/dist-packages (from mlxtend) (0.22.1)
Requirement already satisfied: matplotlib>=1.5.1 in /usr/local/lib/python3.6/dist-packages (from mlxtend) (3.1.3)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from botocore==1.15.16->awscli) (0.9.4)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore==1.15.16->awscli) (2.6.1)
Requirement already satisfied: urllib3<1.26,>=1.20; python_version != "3.4" in /usr/local/lib/python3.6/dist-packages (from botocore==1.15.16->awscli) (1.24.3)
Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<=3.5.0,>=3.1.2->awscli) (0.4.8)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.17.1->mlxtend) (2018.9)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.18->mlxtend) (0.14.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.5.1->mlxtend) (1.1.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.5.1->mlxtend) (2.4.6)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.5.1->mlxtend) (0.10.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore==1.15.16->awscli) (1.12.0)

# Download
download_everything(categories)

100%|███████████████████████████████████████████| 10/10 [02:38<00:00, 16.40s/it]

# Import libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
import math

import matplotlib.patches as mpatches

df_list=create_df([1,2,6,8,13])

100%|███████████████████████████████████████████| 10/10 [08:04<00:00, 42.05s/it]

def feature_extraction1(df_list):

  # Unique reviews per person
  temp=df_list.groupby(['customer_id'])[['review_id']].count().copy()
  temp.columns = ['#reviews']

  # Reviews per person CDF plot
  kwargs = {'cumulative': True}
  sns.distplot(temp[1:len(temp):50]['#reviews'], hist_kws=kwargs, kde_kws=kwargs,hist=False)
  plt.xlim(0,30)
  plt.yticks((np.arange(0, 1.1, step=0.1)))
  plt.xlabel('Num of Reviews')
  plt.ylabel('% Percent')
  plt.title('CDF plot, Num of Reviews per Person')
  plt.grid(True)
  plt.savefig('CDF_reviews.png',dpi=100)

  # Unique reviewers per category
  temp=df_list.groupby(['product_category'])['customer_id'].nunique()
  g=sns.barplot(x=temp.index.tolist(), y=list(temp[:]))
  g.set_xticklabels(labels=temp.index.tolist(),rotation=90)
  plt.title('Unique Reviewers per Category')
  plt.tight_layout()
  plt.savefig('Unique_reviewers_percategory.png',dpi=100)
  plt.show(g)

  # GroupBy customer_id and aggregate by helpfull Votes,Voted Categories, Num of reviews
  a = df_list.groupby(['customer_id']).agg(
  {'product_category': [('voted_categories', pd.Series.unique)],
   'helpful_votes': [('total_helpful', 'sum')],
   'review_id': [('reviews_total', 'count')]})

  print('ok')
  a.columns = a.columns.get_level_values(1)
  a.reset_index(inplace=True)

  # mean len of review
  a['str_len'] = a.review_body.str.split(' ').len()
  return a



def text_feature_extraction(df_list):
  cat = []
  count_categ_dec = {}
  corpus_categ_dec = []
  corpus_review_headline = []
  for i,df in enumerate(df_list):
    cat.append(df.iloc[0]['product_category']) 
    # if i >= 1:
    #   break
    # Keep only the usefull columns
    reviews = df[['review_headline','review_body','review_date']].copy()
    reviews['review_date'] = pd.to_datetime(reviews['review_date'],unit='D')
    # Keep only some reviews headlines
    if len(reviews)<100000:
      corpus_review_headline.append(reviews['review_headline'])
    else:
      corpus_review_headline.append(reviews[::math.floor(len(reviews)/100000)]['review_headline'])

    # Keep in every decate some reviews
    decades = ['1990-01-01','2000-01-01','2010-01-01','2020-01-01']
    count_dec = []
    corpus_categ = []
    for k in range(len(decades)):
      if k > 2:
        break
      part = reviews[(pd.to_datetime(decades[k])<reviews['review_date']) & (reviews['review_date']<=pd.to_datetime(decades[k+1]))].copy()
      part.reset_index(drop=True, inplace=True)
      count_dec.append(len(part))
      if len(part)<20000:
        corpus_categ.append(part['review_body'])
      else:
        corpus_categ.append(part[::math.floor(len(part)/20000)]['review_body'])
    count_categ_dec[df.iloc[0]['product_category']]=count_dec
    corpus_categ_dec.append(corpus_categ)
  return cat,count_categ_dec,corpus_categ_dec,corpus_review_headline    
    


def most_common_words_review_headline (categories,corpus_review_headline):
  for i in range(len(corpus_review_headline)):
    corpus_review_headline[i].reset_index(drop=True, inplace=True)
    first_word = [(corpus_review_headline[i][j].split(" ")[0])for j in range(len(corpus_review_headline[i])-1)]                                                                     
    cat = corpus_review_headline[i]
    batch_size = 3000
    batches_len = math.ceil(len(cat)/batch_size)
    category_reviews=[]
    first_word_reviews=[]
    print('Category {}'.format(categories[i]))
    # Split data into batches
    for k in range(1,batches_len):
      batch = cat[(k-1)*batch_size:k*batch_size]
      try:
        doc = nlp(' '.join(map(str, batch)))
      except ValueError:
        print("nlp function overflow")
        break

      for p in doc.ents:
        if p.text != '/><br' and p.text != '&':
          category_reviews.append(p.text)

    try:
      doc_rev_head = nlp(' '.join(map(str,first_word)))
    except ValueError:
      print("nlp function overflow")
      break 
    for p in doc_rev_head:
      if p.text != '/><br' and p.text != '&':
        first_word_reviews.append(p.text)
      
    # Most frequent words in review head (entire headline)
    temp = np.array([ele for ele in Counter(category_reviews).most_common(10)])
    temp = pd.DataFrame(temp,columns=['words','counts'])
    temp['counts']=temp['counts'].astype(int)
    review_words_plots(temp,10,categories[i],1)

    # Most frequent words in review Head (first word only)
    temp = np.array([ele for ele in Counter(first_word_reviews).most_common(10)])
    temp = pd.DataFrame(temp,columns=['words','counts'])
    temp['counts']=temp['counts'].astype(int)
    review_words_plots(temp,10,categories[i],0)


def review_words_plots(data, length, category,mode):
  plt.figure(figsize=(7,7))
  fig = sns.barplot(y=data['words'], x=data['counts'])

  plt.title("{}, {} Most Frequent Words in Review Headline".format(category,length),fontsize=16)

  plt.ylabel('')
  plt.xlabel('Count',fontsize=14)
  loc, labels = plt.xticks()
  yloc, ylabels=plt.yticks()
  plt.yticks(fontsize=12)
  plt.xticks(fontsize=12)

  if mode == 0 :
    patch = mpatches.Patch(color='#dffc03', label='1st word from headline')
  else:
    patch = mpatches.Patch(color='#dffc03', label='Entire Headline')

  plt.legend(handles=[patch])
  plt.tight_layout()
  if mode == 0 :
    plt.savefig(category+' headline 1st'+'.png',dpi=200,pad_inches=1)
  else:
    plt.savefig(category+' headline entire'+'.png',dpi=200,pad_inches=1)



def most_common_words(corpus,count):
  # For every category
  most_categ_dec = {}
  categories=list(count.keys())
  for i in range(len(corpus)):
    # if i==1:
    #   break
    temp = corpus[i]
    print('Category {}'.format(i))
    # For every data decade 
    for j,dec in enumerate(temp):
      decade = []
      dec.reset_index(drop=True, inplace=True)
      batch_size = 1000
      batches_len = math.ceil(len(dec)/batch_size)
      print('Decade {}'.format(j))

      # Split data into batches
      for k in range(1,batches_len):
        batch = dec[(k-1)*batch_size:k*batch_size]
        print('Batch {}'.format(k))
        try:
          doc = nlp(''.join(map(str, batch)))
        except ValueError:
          print("nlp function overflow")
          break

        # keep only (ORG,PERSON) brands related words
        for p in doc.ents:
          if (p.label_ == 'ORG' or p.label_ == 'PERSON') and p.text != '/><br' and p.text != '&':
            decade.append(p.text)
      if len(Counter(decade)) != 0:
        # 15 Most frequent Words (Brads related)  
        temp = np.array([ele for ele in Counter(decade).most_common(15)])
        temp = pd.DataFrame(temp,columns=['words','counts'])
        temp['counts']=temp['counts'].astype(int)
        
        # print and save plots for every category in every decade
        words_plots(temp,15,categories[i],str(j))



def words_plots(data, length, category,dec):
  plt.figure(figsize=(7,7))
  fig = sns.barplot(y=data['words'], x=data['counts'])

  plt.title("{}, {} Most Frequent Words (Brands Related)".format(category,length),fontsize=18)
  plt.ylabel('')
  plt.xlabel('Count',fontsize=20)
  loc, labels = plt.xticks()
  yloc, ylabels=plt.yticks()
  plt.yticks(fontsize=12)
  plt.xticks(fontsize=18)
  if dec == '0':
    decade = '1990s'
  elif dec == '1':
    decade = '2000s'
  elif dec == '2':
    decade = '2010s'
  patch = mpatches.Patch(color='#dffc03', label=decade)
  plt.legend(handles=[patch])
  plt.tight_layout()
  plt.savefig(category+decade+'.png',dpi=200,pad_inches=1)

temp=feature_extraction1(df_list)

temp.head()

# Use main code to add dataframes in a list
download_everything(categories)
total_df = create_df([6,12,13,14]) # Choose which columns to use

chosen_categories = total_df['product_category'].unique().tolist()
print(chosen_categories)
df_list=[]
for category in tqdm(chosen_categories, ncols=50):
  df_list.append(total_df[total_df['product_category'] == category])

 cat,count_df,corpus_df,corpus_review_headline = text_feature_extraction(df_list)

most_common_words_review_headline(cat,corpus_review_headline)

most_common_words(corpus_df,count_df)

count_df

sns.set()
sorted_list=sorted(df,key=(lambda x:df[x][2]))
x = np.arange(len(df[list(df.keys())[0]]))  # the label locations
width = 0.05  # the width of the bars

labels = list(df.keys())
fig, ax = plt.subplots()

rects0= ax.bar(x-4*width, df[sorted_list[0]], width, label=sorted_list[0])
rects1= ax.bar(x-3*width, df[sorted_list[1]], width, label=sorted_list[1])
rects2= ax.bar(x-2*width, df[sorted_list[2]], width, label=sorted_list[2])
rects3= ax.bar(x-width, df[sorted_list[3]], width, label=sorted_list[3])
rects4= ax.bar(x, df[sorted_list[4]], width, label=sorted_list[4])
rects5= ax.bar(x+width, df[sorted_list[5]], width, label=sorted_list[5])
rects6= ax.bar(x+2*width, df[sorted_list[6]], width, label=sorted_list[6])
rects7= ax.bar(x+3*width, df[sorted_list[7]], width, label=sorted_list[7])
rects8= ax.bar(x+4*width, df[sorted_list[8]], width, label=sorted_list[8])
rects9= ax.bar(x+5*width, df[sorted_list[9]], width, label=sorted_list[9])

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Reviews')
ax.set_title('Reviews Count by Decade')
ax.set_xticks(x)
ax.set_xticklabels(['1990s','2000s','2010s'])
ax.tick_params(axis='both', pad=1)
ax.legend()
plt.tight_layout()
plt.savefig('reviews_hist.png', dpi=200)

plot_review_count_hist(count_df)

df_list=create_df([1,2,6,7,8,13])

# functions for feature extraction
def feature_extraction2(df):

    # Unique reviews per person
    temp=df.groupby(['customer_id'])[['review_id']].count().copy()
    temp.columns = ['#reviews']

    # Reviews per person CDF plot
    kwargs = {'cumulative': True}
    sns.distplot(temp[1:len(temp):50]['#reviews'], hist_kws=kwargs, kde_kws=kwargs,hist=False)
    plt.xlim(0,30)
    plt.yticks((np.arange(0, 1.1, step=0.1)))
    plt.xlabel('Num of Reviews')
    plt.ylabel('% Percent')
    plt.title('CDF plot, Num of Reviews per Person')
    plt.grid(True)
    plt.savefig('CDF_reviews.png',dpi=100)

    # Unique reviewers per category
    temp=df.groupby(['product_category'])['customer_id'].nunique()
    g=sns.barplot(x=temp.index.tolist(), y=list(temp[:]))
    g.set_xticklabels(labels=temp.index.tolist(),rotation=90)
    plt.title('Unique Reviewers per Category')
    plt.tight_layout()
    plt.savefig('Unique_reviewers_percategory.png',dpi=100)
    plt.show(g)

    # GroupBy customer_id and aggregate by helpfull Votes, Voted Categories, Num of reviews, Avg Star Rating, Avg Review lenght
    df['review_lenght'] = df['review_body'].str.split(' ').str.len()

    a = df.groupby(['customer_id']).agg(
    { 'product_category': pd.Series.unique,
      'review_id': 'count',
      'helpful_votes': 'sum',
      'star_rating' : 'mean',
      'review_lenght': 'mean'})
    a.columns=['Product_Categories','Num_Of_Reviews','Total_Helpful_Votes','Avg_Star_Rating','Avg_Words_Per_Review']
    del df
    a = a.sort_values(by='Total_Helpful_Votes',ascending=False)
    # Order By helpful Votes
    return a

temp=feature_extraction2(df_list)
temp

# Correlation Matrix

def cor_matrix(df):
  # Generate a mask for the upper triangle
  
  cor = df.corr(method ='pearson') 
  mask = np.triu(np.ones_like(cor, dtype=np.bool))
  f, ax = plt.subplots(figsize=(15, 12))
  cmap =  sns.diverging_palette(220, 20, sep=20, as_cmap=True)
  sns.heatmap(cor , mask=mask, cmap=cmap, center=0, annot=True, fmt='0.3f',square=True, linewidths=1, cbar_kws={'shrink': .7})
  plt.savefig('cor_matrix.png')
  plt.show()

cor_matrix(temp[['Num_Of_Reviews','Total_Helpful_Votes','Avg_Star_Rating','Avg_Words_Per_Review']])

# Choose the columns you need as a parameter
total_df = create_df([1,2,6,7,8,9,13,14])
# Info about df, column names/types
print('Info:')
print(total_df.info())
print('Shape: ',total_df.shape)
# Check duplicate rows
print('duplicate rows',len(total_df[total_df['review_id'].duplicated()]))

# Average number of ratings per user
new_df = total_df.groupby('customer_id', as_index = False)['review_id'].agg(['count'])
print('Average number of reviews per user',new_df['count'].mean())

100%|███████████████████████████████████████████| 10/10 [07:51<00:00, 39.85s/it]

Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 39434657 entries, 0 to 1741043
Data columns (total 8 columns):
customer_id         uint32
review_id           object
product_category    object
star_rating         uint8
helpful_votes       uint16
total_votes         uint16
review_body         object
review_date         datetime64[ns]
dtypes: datetime64[ns](1), object(3), uint16(2), uint32(1), uint8(1)
memory usage: 1.8+ GB
None
Shape:  (39434657, 8)
duplicate rows 0
Average number of reviews per user 2.8093271326939346

# Make correlation plot between features
corr = total_df.corr()
corr
#sns.heatmap(corr, annot = True)

# PLOT: average star rating per product category
def average_rating_per_category(df):
  new_df = df.groupby('product_category', as_index = False)['star_rating'].mean().sort_values(by = 'star_rating', ascending = True)
  sns.set(style="whitegrid")
  ax = plt.subplots(figsize=(18,5))
  ax = sns.barplot(x="product_category", y="star_rating", data=new_df)
  ax.set(xlabel='Product category', ylabel='Average star rating')  
  plt.title('Average star rating per product category')
  plt.tight_layout()
  plt.show()


# PLOTS: 
# - average star rating per year
# - number of ratings per year
def average_rating_and_no_of_ratings_per_year(df):
  df['year'] = df['review_date'].dt.year
  df['year'] = pd.to_datetime(df['year'], format='%Y')
  new_df = df.groupby('year', as_index = False)['star_rating'].agg(['mean', 'count']).sort_values(by = 'year', ascending = True)
  count = new_df['count']
  mean = new_df['mean']
  years = new_df.index

  sns.set(style="whitegrid")
  plt.figure(figsize=(15,5))

  plt.subplot(1, 2, 1)
  ax = sns.lineplot(x=years, y=mean, data=new_df, marker="o")
  ax.set(xlabel='Years', ylabel='Avarage Star Rating')  
  plt.title('Avg Rating vs Years', color='red')
  plt.subplot(1, 2, 2)
  ax = sns.lineplot(x=years, y=count, data=new_df, marker="o")
  ax.set(xlabel='Years', ylabel='Count of Ratings')  
  plt.title('Count of Ratings vs Years',color='green')

  plt.tight_layout()
  plt.show()
  df = df.drop(columns=['year'])


# PLOTS: 
# - distribution of star ratings
# - average length of reviews vs star ratings
def distr_of_star_ratings_and_avg_length_of_reviews_vs_star_ratings(df):
  plot1_df = df.groupby('star_rating', as_index = False).size().reset_index(name='counts')
  df['len_of_review'] = df['review_body'].str.len()
  plot2_df = df.groupby('star_rating', as_index = False)['len_of_review'].mean()

  plt.figure(figsize=(15,5))

  plt.subplot(1, 2, 1)
  ax = sns.barplot(x=plot1_df['star_rating'], y=plot1_df['counts'])
  ax.set(xlabel='Rating', ylabel='No of reviews with this rating')  
  plt.title('Distribution of star ratings')
  plt.subplot(1, 2, 2)
  ax = sns.barplot(x=plot2_df['star_rating'], y=plot2_df['len_of_review'])
  ax.set(xlabel='Rating', ylabel='Avg length of reviews')  
  plt.title('Avg lenth of reviews vs star ratings')

  plt.tight_layout()
  plt.show()
  df = df.drop(columns=['len_of_review'])

average_rating_per_category(total_df)

average_rating_and_no_of_ratings_per_year(total_df)

/usr/local/lib/python3.6/dist-packages/pandas/plotting/_matplotlib/converter.py:103: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()
  warnings.warn(msg, FutureWarning)

distr_of_star_ratings_and_avg_length_of_reviews_vs_star_ratings(total_df)

# Clear RAM
del total_df
gc.collect()

14432

def verified_count(df):
  # Returns: product_category, star_rating, verified_purchase, verified_purchase_counts
  return df.groupby(['product_category',
                     'star_rating',
                     'verified_purchase']
                    ).agg(counts=('verified_purchase','count')
                    ).reset_index()


def monthly_purchase(df):
  # Returns: Monthly purchases sorted by year for given category
  mp = pd.crosstab(df.review_date.dt.year, df.review_date.dt.strftime('%b'),
                   df.product_category, aggfunc='count',
                   rownames=['Year'], colnames=['Month'])

  # Replace NaN with 0s.
  mp.fillna(0, inplace=True)

  # Arrange by monrh
  mp = mp[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]

  # Sort by Year
  mp.sort_values(by='Year', inplace=True)

  return mp


def products_reviews_count(df):

  data = {
      # 'marketplace': [df.marketplace.values[0]],
      'product_category': [df.product_category.values[0]],
      'total_products': [df.product_id.nunique()],
      'total_reviews': [df.shape[0]]
    }

  return pd.DataFrame(data=data)


def count_unique_customer(df):

  uq = df[df.verified_purchase == 'Y'].customer_id.value_counts().reset_index()
  uq.rename(columns={'customer_id': 'total_reviews', 'index': 'customer_id'}, inplace=True)

  uq['product_category'] = df.product_category.values[0]

  return uq

# Which columns to load, using less is faster.
usecols = [0, 1, 3, 6, 7, 11, 14]

# total count for verified or not reviews (1-5) stars
verified_counts = []

# monthly purchases for all categories sorted by year
monthly_purchases = {}

# Total products/reviews counts per category
products_reviews_counts = []

# count unique customers per category
count_unique_customers = []

# files in 'data/'
files = [file for file in glob.glob('data/*.gz')]

# Iterate over all categories
for file in tqdm(files, ncols=80):

  # Category name, remove 'data/us_' and '_v1_xx.tsv.gz'
  category = file[8:-13]

  # Load Data
  df = request_dataframe(file, usecols=usecols)

  # Process
  verified_counts.append(verified_count(df))
  products_reviews_counts.append(products_reviews_count(df))
  monthly_purchases[category] = monthly_purchase(df)
  count_unique_customers.append(count_unique_customer(df))

100%|███████████████████████████████████████████| 10/10 [10:55<00:00, 55.75s/it]

# Concat them to get one final Dataframe
count_unique_customers = pd.concat(count_unique_customers).reset_index(drop=True)
verified_counts = pd.concat(verified_counts).reset_index(drop=True)
products_reviews_counts = pd.concat(products_reviews_counts).reset_index(drop=True)

# Sort them for better visuals
count_unique_customers = count_unique_customers.sort_values(by='total_reviews', ascending=False)
verified_counts = verified_counts.sort_values(by='counts', ascending=False)
products_reviews_counts = products_reviews_counts.sort_values(by='total_reviews', ascending=False)

count_unique_customers.head()

verified_counts.head()

products_reviews_counts.head()

monthly_purchases['PC'].head(100)

def plot_trend(crosstab):

  flattened = pd.DataFrame(crosstab.to_records())

  flattened['Spring'] = flattened.Mar + flattened.Apr + flattened.May
  flattened['Summer'] = flattened.Jun + flattened.Jul + flattened.Aug
  flattened['Autumn'] = flattened.Sep + flattened.Oct + flattened.Nov
  flattened['Winter'] = flattened.Dec + flattened.Jan + flattened.Feb

  flattened.set_index('Year', drop=True, inplace=True)
  
  flattened.drop(columns=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], inplace=True)

  ax = flattened.plot(xticks=flattened.index, figsize=(12, 8))
  plt.title(f'{category}: Total Reviews vs Years', color='red')

for key, val in monthly_purchases.items():
  plot_trend(monthly_purchases[key])


plt.tight_layout()
plt.show()

# Convert to pivot table, where each category is a column and holds total reviews given by each customer.
customer_reviews_per_category = count_unique_customers.pivot(index='customer_id', columns='product_category', values='total_reviews')
customer_reviews_per_category.fillna(0, inplace=True)
customer_reviews_per_category.head()

customer_reviews_corr = customer_reviews_per_category.iloc[:, 1:].corr()

customer_reviews_corr

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(customer_reviews_corr, dtype=np.bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 12))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True, l=40)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(customer_reviews_corr, mask=mask, cmap=cmap, center=0, annot=True, fmt='0.3f',
            square=True, linewidths=1, cbar_kws={'shrink': .7})

plt.show()

ax = products_reviews_counts.plot(kind='bar', figsize=(15, 8))
#ax = sns.barplot(x="product_category", y="total_reviews", hue="total_reviews", data=products_reviews_counts)
ax.set_xticklabels(products_reviews_counts.product_category)
plt.tight_layout()
plt.show()

products_reviews_counts['ratio'] = products_reviews_counts.total_reviews/products_reviews_counts.total_products
print(products_reviews_counts)

         product_category  total_products  total_reviews      ratio
6                      PC          441955        6908283  15.631191
3                 Apparel         2305407        5905481   2.561578
4  Health & Personal Care          489603        5331217  10.888857
5                  Beauty          588782        5115231   8.687818
0                    Toys          664017        4863800   7.324812
8                   Shoes         1900926        4366449   2.297012
1             Electronics          185844        3093746  16.647005
2                 Jewelry          574036        1767480   3.079040
9                   Tools          183801        1741044   9.472440
7                Software           28736         341926  11.898872

df = create_df(usecols=[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14])
df['year'] = df.review_date.dt.year

100%|███████████████████████████████████████████| 10/10 [07:03<00:00, 36.40s/it]

_, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 20))
colors = list(mcolors.TABLEAU_COLORS.values())[:10]

df_ = df[df.verified_purchase == 'Y'].copy()
df_ = df_.groupby('product_category').size().reset_index(name='counts').sort_values(by='product_category')
df_.loc[:, 'colors'] = colors

df_.plot(ax=ax[0], kind='pie', y='counts', labels=df_.product_category, legend=False, autopct='%1.f%%', colors=df_.colors)
ax[0].set_title('Verified Purchases')
ax[0].set_ylabel('')

df_ = df[df.verified_purchase == 'N'].copy()
df_ = df_.groupby('product_category').size().reset_index(name='counts').sort_values(by='product_category')
df_.loc[:, 'colors'] = colors

df_.plot(ax=ax[1], kind='pie', y='counts', labels=df_.product_category, legend=False, autopct='%1.f%%', colors=df_.colors)
ax[1].set_title('Non Verified Purchases')
ax[1].set_ylabel('')

plt.tight_layout()
plt.show()

# Average start, total reviews per year
df.groupby(['year']).agg({'star_rating': [('Mean Stars', 'mean'), ('Total Reviews', 'count')]})

# Top 5 most helpful reviews
df.sort_values(by='helpful_votes', ascending=False).head(5)

# Customers who reviewed same products
df.groupby(['customer_id', 'product_id']).size().reset_index(name='counts').sort_values(by='counts', ascending=False).head(10)

# Clear RAM
del df, customer_reviews_per_category
gc.collect()

74464

# Import libraries
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

from sklearn import tree

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import sys, os, re, csv, codecs
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

Using TensorFlow backend.

# PREPROCESSING

# Choose the columns you need as a parameter
total_df = create_df([7,12,13])
no_of_rows = 100000
# Select randomly 100000 rows from the df
my_df = pd.DataFrame({})
for i in total_df['star_rating'].unique():
  my_df = my_df.append(total_df[total_df['star_rating'] == i].sample(n = int(no_of_rows/5), replace = False), ignore_index=True)

# Keep only useful columns
my_df = my_df[['review_body','review_headline','star_rating']]
my_df['review'] = my_df[['review_body', 'review_headline']].agg(' '.join, axis=1)
# Create column for classification
my_df['classification'] = my_df['star_rating'].apply(lambda x: 1 if x>2.5 else 0) # 0 if x<3 else 1 if x==3 else 2)
y = my_df['classification']
X = my_df.loc[:, my_df.columns != 'classification']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print('Train set:',x_train.shape)
print('Test set:',x_test.shape)

# Clean data
replace_symbols_by_space = re.compile('[/(){}\[\]\|@,;]')
eng_stopwords = set(stopwords.words('english'))

def clean_text(text):
  # Lowercase text
  text = text.lower()
  # Replace symbols by space in text
  text = replace_symbols_by_space.sub(' ', text)
  # Delete stopwors from text
  text = ' '.join(word for word in text.split() if word not in eng_stopwords)
  return text

x_train["review"] = x_train["review"].apply(clean_text)
x_test["review"] = x_test["review"].apply(clean_text)

# Clear RAM
del total_df,my_df
gc.collect()

100%|███████████████████████████████████████████| 10/10 [07:27<00:00, 38.15s/it]

Train set: (67000, 4)
Test set: (33000, 4)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:34: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

7

# Decision Tree Classifier 

train_text = x_train["review"]
test_text = x_test["review"]
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=20000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=30000) 
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features]) 

train_target = y_train
test_target = y_test
#Define the model
tree_model = tree.DecisionTreeClassifier()
# Train the model
tree_model.fit(X = train_features, y = train_target)
# Get probability
preds = tree_model.predict_proba(X = test_features)[:, 1]
#print(np.mean(preds))
#print(tree_model.score(X = train_features, y = train_target))
print('Accuracy',tree_model.score(X = test_features, y = test_target))

Accuracy 0.7376969696969697

# Logistic Regression

train_features = hstack([train_char_features, train_word_features]) 
test_features = hstack([test_char_features, test_word_features])

train_target = y_train
#Define the model
classifier = LogisticRegression(C=0.1, solver='sag')
cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=5, scoring='accuracy')) 
print('CV score for class {} is {}'.format(y_train.name, cv_score))

classifier.fit(train_features, train_target)
print('Accuracy:',classifier.score(test_features,y_test))

CV score for class classification is 0.8419701492537314
Accuracy: 0.8443333333333334

# Linear SVM with SGD training

train_features = hstack([train_char_features, train_word_features]) 
test_features = hstack([test_char_features, test_word_features]) 

#Define the model
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=10, tol=None)
# Training the model
# Fitting the training data to the model
clf.fit(train_features,y_train)
# Making predictions with the model
svm_prediction=clf.predict(test_features)
# Evaluating model performance
ctr=0
score=0
misclassified=[]
for i,j in zip(y_test,svm_prediction):
    if i==j:
        score+=1
    ctr+=1
print ("Score :",score,"/",len(y_test))

print(accuracy_score(y_test, svm_prediction))
#print(np.mean(svm_prediction == y_test))

print(f1_score(y_test, svm_prediction, average='macro'))
print(f1_score(y_test, svm_prediction, average='micro'))
print(f1_score(y_test, svm_prediction, average='weighted'))
# Print classification report
print(classification_report(y_test, svm_prediction))
# Compute confusion matrix
cnf_mtx = confusion_matrix(y_test, svm_prediction)
print(cnf_mtx)

Score : 27317 / 33000
0.8277878787878787
0.8124644714701685
0.8277878787878787
0.8230103805453912
              precision    recall  f1-score   support

           0       0.87      0.67      0.76     13254
           1       0.81      0.93      0.87     19746

    accuracy                           0.83     33000
   macro avg       0.84      0.80      0.81     33000
weighted avg       0.83      0.83      0.82     33000

[[ 8942  4312]
 [ 1371 18375]]

# LSTM 

train_text = x_train["review"]
test_text = x_test["review"]

max_features = 30000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_text))
list_tokenized_train = tokenizer.texts_to_sequences(train_text)
list_tokenized_test = tokenizer.texts_to_sequences(test_text)

total_num_words = [len(review) for review in list_tokenized_train]
plt.hist(total_num_words,bins = np.arange(0,410,10))
plt.show()
print('Mean length of reviews (body+headline):',sum(total_num_words)/len(total_num_words),'words')
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

inp = Input(shape=(maxlen, )) 
embed_size = 128
x = Embedding(max_features, embed_size)(inp)
x = LSTM(10, return_sequences=True,name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x) 
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
#Define the model
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
print(model.summary())
batch_size = 128
epochs = 2 
lstm_model = model.fit(X_t,y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)
score, acc = model.evaluate(X_te, y_test,
                            batch_size=batch_size, verbose=1)
#print('Test score:', score)
print('Test accuracy:', acc)

Mean length of reviews (body+headline): 31.903253731343284 words
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 128)          3840000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 200, 10)           5560      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 10)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                550       
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
=================================================================
Total params: 3,846,161
Trainable params: 3,846,161
Non-trainable params: 0
_________________________________________________________________
None
Train on 60300 samples, validate on 6700 samples
Epoch 1/2
60300/60300 [==============================] - 99s 2ms/step - loss: 0.4407 - acc: 0.7824 - val_loss: 0.3648 - val_acc: 0.8303
Epoch 2/2
60300/60300 [==============================] - 101s 2ms/step - loss: 0.3147 - acc: 0.8654 - val_loss: 0.3642 - val_acc: 0.8370
33000/33000 [==============================] - 9s 276us/step
Test accuracy: 0.8357878787734292

# Plots of loss and accuracy
plt.clf()
loss = lstm_model.history['loss']
val_loss = lstm_model.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'g', label='Training loss')
plt.plot(epochs, val_loss, 'y', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.clf()
acc = lstm_model.history['acc']
val_acc = lstm_model.history['val_acc']
plt.plot(epochs, acc, 'g', label='Training acc')
plt.plot(epochs, val_acc, 'y', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Load PC Data
df = request_dataframe('data/us_PC_v1_00.tsv.gz')
df['year'] = df.review_date.dt.year

# Keep year 2013
df = df[df.year == 2013]

# Convert to string
df.product_parent = df.product_parent.astype(str)

joined_df = df.groupby(['customer_id', 'year'])['product_parent'].apply(','.join)
joined_df = joined_df.reset_index()
joined_df.head()

# Keep combinations of length > 1
joined_df = joined_df[joined_df.product_parent.str.contains(',')]
joined_df

# Grab pairs
joined_df['product_parent'] = joined_df.product_parent.str.split(',')
joined_df = joined_df.product_parent.tolist()

# Throtle to 80k
joined_df = joined_df[:80000]

# 1hot encoding, use sparse to save ram
te = TransactionEncoder()
te_ary = te.fit(joined_df).transform(joined_df, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
sparse_df.shape

(80000, 48505)

freq_items = apriori(sparse_df, min_support=0.01, use_colnames=True)
freq_items

rules = association_rules(freq_items, min_threshold=0.01)
rules

	customer_id	star_rating	helpful_votes	total_votes
customer_id	1.000000	-0.012626	0.016825	0.018244
star_rating	-0.012626	1.000000	-0.015182	-0.028144
helpful_votes	0.016825	-0.015182	1.000000	0.995134
total_votes	0.018244	-0.028144	0.995134	1.000000

	customer_id	total_reviews	product_category
0	16404530	749	Toys
10604929	4808156	503	Beauty
1	48987445	413	Toys
4009599	33315159	366	Jewelry
2	10526256	306	Toys

Month	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
Year
1999	0.0	0.0	0.0	0.0	0.0	0.0	21.0	26.0	31.0	44.0	76.0	186.0
2000	159.0	124.0	145.0	160.0	251.0	457.0	369.0	334.0	250.0	329.0	474.0	544.0
2001	499.0	405.0	454.0	364.0	428.0	596.0	592.0	609.0	569.0	568.0	622.0	882.0
2002	842.0	695.0	816.0	719.0	648.0	565.0	802.0	877.0	830.0	1019.0	999.0	1313.0
2003	1420.0	1114.0	1104.0	1000.0	1061.0	1118.0	1010.0	1138.0	1108.0	1091.0	1079.0	1388.0
2004	1548.0	1214.0	1308.0	1172.0	1100.0	1137.0	1103.0	947.0	1026.0	1009.0	1228.0	1333.0
2005	1438.0	1197.0	1348.0	1049.0	1000.0	1063.0	1408.0	2328.0	2507.0	1948.0	1178.0	1707.0
2006	2221.0	2766.0	3597.0	730.0	1358.0	1572.0	2305.0	2857.0	1453.0	1420.0	3783.0	2215.0
2007	8595.0	4004.0	5733.0	3103.0	5682.0	3656.0	4239.0	4424.0	4697.0	4380.0	5208.0	6164.0
2008	6934.0	6275.0	5958.0	6203.0	6278.0	6109.0	6357.0	6677.0	6748.0	7092.0	7633.0	9145.0
2009	11694.0	10379.0	11595.0	10963.0	9590.0	9785.0	8604.0	9266.0	11750.0	10676.0	11209.0	14330.0
2010	16047.0	17649.0	15553.0	13201.0	14080.0	13636.0	16311.0	17331.0	20313.0	21707.0	20921.0	26422.0
2011	33499.0	27184.0	28291.0	26805.0	25963.0	27650.0	29752.0	32517.0	35086.0	33891.0	40156.0	56401.0
2012	62918.0	45858.0	41694.0	39041.0	38754.0	39199.0	42531.0	44737.0	47206.0	51701.0	69290.0	138913.0
2013	165962.0	123849.0	121770.0	106731.0	105127.0	99173.0	105328.0	113254.0	99815.0	106935.0	101114.0	147897.0
2014	192501.0	141363.0	140257.0	119098.0	112671.0	112499.0	181464.0	189741.0	183084.0	185491.0	181382.0	257744.0
2015	301039.0	254032.0	255994.0	212251.0	203362.0	202331.0	220521.0	228263.0	0.0	0.0	0.0	0.0

product_category	Beauty	Electronics	Health & Personal Care	Jewelry	PC	Shoes	Software	Tools	Toys
product_category
Beauty	1.000000	0.102029	0.426728	0.174999	0.107389	0.217230	0.038936	0.074667	0.126799
Electronics	0.102029	1.000000	0.181270	0.034113	0.412488	0.063146	0.119665	0.227636	0.086419
Health & Personal Care	0.426728	0.181270	1.000000	0.123782	0.177997	0.175334	0.077363	0.153455	0.151993
Jewelry	0.174999	0.034113	0.123782	1.000000	0.041739	0.156160	0.013896	0.026723	0.092003
PC	0.107389	0.412488	0.177997	0.041739	1.000000	0.066336	0.159489	0.207519	0.093672
Shoes	0.217230	0.063146	0.175334	0.156160	0.066336	1.000000	0.018754	0.054035	0.110683
Software	0.038936	0.119665	0.077363	0.013896	0.159489	0.018754	1.000000	0.077924	0.035121
Tools	0.074667	0.227636	0.153455	0.026723	0.207519	0.054035	0.077924	1.000000	0.072447
Toys	0.126799	0.086419	0.151993	0.092003	0.093672	0.110683	0.035121	0.072447	1.000000

	star_rating
	Mean Stars	Total Reviews
year
1997	4.000000	1
1998	4.255814	43
1999	4.258635	3619
2000	4.051943	17500
2001	3.876644	26760
2002	3.869913	43325
2003	3.753678	60693
2004	3.701050	76568
2005	3.744062	109726
2006	3.802562	140910
2007	4.020054	328416
2008	4.032114	437217
2009	4.007070	603661
2010	3.986727	893442
2011	4.007989	1556706
2012	4.067250	2805169
2013	4.136746	7024592
2014	4.169426	12420086
2015	4.203042	12886223

Amazon Customer Reviews Dataset¶

Main functions to download dataset¶

Reviews, Text Mining¶

Feature Extraction¶

Exploratory Data Analysis¶

General info¶

Plots and statistics¶

Machine Learning - Sentiment Analysis¶

Preprocessing¶

Algorithms¶

Itemsets¶

	product_category	star_rating	verified_purchase	counts
69	PC	5	Y	3680931
39	Apparel	5	Y	2995966
49	Health & Personal Care	5	Y	2805289
59	Beauty	5	Y	2711391
9	Toys	5	Y	2620290

	product_category	total_products	total_reviews
6	PC	441955	6908283
3	Apparel	2305407	5905481
4	Health & Personal Care	489603	5331217
5	Beauty	588782	5115231
0	Toys	664017	4863800

product_category	Apparel	Beauty	Electronics	Health & Personal Care	Jewelry	PC	Shoes	Software	Tools	Toys
customer_id
10003	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
10005	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
10007	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
10009	0.0	7.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
10016	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0

	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	verified_purchase	review_headline	review_date	year
6762840	52994074	R3R24QH3CDS83N	B000FI73MA	796637267	Kindle: Amazon's Original Wireless Reading Dev...	PC	5	47524	48362	Y	Why and how the Kindle changes everything	2007-11-25	2007
5633035	51121444	R2XKMDXZHQ26YX	B000OE2OLU	771086291	The Mountain Kids 100% Cotton Three Wolf Moon ...	Apparel	5	41278	41889	N	Dual Function Design	2008-11-11	2008
6422658	53025525	R2YVZNKUMWGYJ4	B003FSUDM4	57096051	Kindle Keyboard 3G, Free 3G + Wi-Fi, 6" E Ink ...	PC	5	31924	32373	Y	Kindle vs. Nook (updated)	2010-08-28	2010
5552581	12267963	R1PL1U31XQG8KG	B0083PWAPW	270935683	Kindle Fire HD 7", Dolby Audio, Dual-Band Wi-Fi	PC	4	31417	32166	Y	You Get What You Pay For	2012-09-17	2012
6014420	51835976	RBUNL9QWDYTD7	B0051VVOB2	341475382	Kindle Fire (Previous Generation - 1st)	PC	4	28611	29433	Y	A great device WHEN you consider price and fun...	2011-11-14	2011

	customer_id	product_id	counts
29653524	41815516	B00004Y7CF	11
37092221	50848121	B00004Y3QV	10
39195955	52935813	B00006F7S2	10
38327249	52129858	B00000JNF4	10
22980626	30839367	B00008VFCU	10
29723786	41903771	B00006JHWQ	9
19703014	25398565	B00009P7I2	9
28971184	40828678	B00005QXWI	7
14018713	17633439	B0001RH9WY	7
28893797	40682067	B0000899ZP	7

	customer_id	year	product_parent
0	10052	2013	959287788
1	10067	2013	603357256
2	10080	2013	660150950
3	10136	2013	612062836
4	10146	2013	944948962

	customer_id	year	product_parent
12	10367	2013	978870549,554437936,651146058
16	10563	2013	584577108,56368570
24	11049	2013	810718557,571829280,336891384,631954520,470945...
26	11184	2013	462889525,200035237,603357256
31	11359	2013	839982486,398989010
...	...	...	...
971658	53096052	2013	185327228,964375521
971663	53096270	2013	949829144,650780817
971666	53096363	2013	337108060,374962352,397677674,702192632,827811...
971669	53096402	2013	692974962,631985698,23779211
971671	53096454	2013	810961880,85987590

	support	itemsets
0	0.024588	(185327228)
1	0.028012	(270935683)
2	0.016313	(391328525)
3	0.010013	(483747720)
4	0.016688	(492347851)
5	0.010462	(613424701)
6	0.026875	(725342975)
7	0.015738	(964375521)
8	0.010900	(185327228, 964375521)

	antecedents	consequents	antecedent support	consequent support	support	confidence	lift	leverage	conviction
0	(185327228)	(964375521)	0.024588	0.015738	0.0109	0.443315	28.169321	0.010513	1.768077
1	(964375521)	(185327228)	0.015738	0.024588	0.0109	0.692613	28.169321	0.010513	3.173241