Daily Matrix Factorization Update

In [ ]:
import functions_framework
from google.cloud import bigquery, storage
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import numpy as np

@functions_framework.http
def main(request):
    """Nightly batch job: rebuild the collaborative-filtering artifacts.

    Reads per-user app foreground time from BigQuery, builds a user x app
    weight matrix, then writes four artifacts to Cloud Storage:
      * explain.pkl          - {user_id: most similar other user_id}
      * data.pkl             - the raw (user, app, weight) DataFrame
      * user_item_matrix.pkl - NMF-reconstructed score matrix (numpy array)
      * rec_users.txt / rec_apps.txt - row/column labels for that matrix

    Args:
        request: the incoming HTTP request (unused; the job takes no input).
    Returns:
        A JSON-serializable status dict.
    """
    client = bigquery.Client()
    storage_client = storage.Client()

    # Log-scaled total foreground time acts as an implicit "rating".
    query = """
    SELECT user_id, package_name, LOG(SUM(foreground_time_ms)) as weight
    FROM `data604-project-g3.Footprint_data.app_usage`
    GROUP BY user_id, package_name
    ORDER BY user_id
    """
    df = client.query(query).to_dataframe()

    # user x app matrix; pairs with no usage get weight 0.
    user_item_matrix = df.pivot(index='user_id', columns='package_name', values='weight').fillna(0)

    # cosine_similarity(X) already computes pairwise X-vs-X similarities;
    # passing the matrix twice was redundant.
    similarity_matrix = cosine_similarity(user_item_matrix)

    # For each user, pick the *other* user with the highest similarity.
    # argsort()[-1] is the user itself (self-similarity is maximal), so
    # take [-2]. Guard the single-user case: [-2] on a length-1 row would
    # raise IndexError.
    similarity_dict = {}
    if len(user_item_matrix.index) > 1:
        for target_user_id in user_item_matrix.index:
            target_user_idx = user_item_matrix.index.get_loc(target_user_id)
            similar_user_idx = similarity_matrix[target_user_idx].argsort()[-2]
            similarity_dict[target_user_id] = user_item_matrix.index[similar_user_idx]

    # Hoist the bucket handles instead of re-resolving them per write.
    data_bucket = storage_client.bucket('data604-project-g3-data-free')
    public_bucket = storage_client.bucket('data604-project-g3-public-free')

    with data_bucket.blob("ml/collaborative_filtering/explain.pkl").open("wb") as file:
        pickle.dump(similarity_dict, file)
    with data_bucket.blob("ml/collaborative_filtering/data.pkl").open("wb") as file:
        pickle.dump(df, file)

    # Low-rank NMF reconstruction fills in predicted scores for apps a
    # user has not used yet; the serving endpoint ranks from this matrix.
    model = NMF(n_components=5)
    A = model.fit_transform(user_item_matrix)
    B = model.components_
    user_item_matrix_estimated = np.dot(A, B)

    with data_bucket.blob("ml/collaborative_filtering/user_item_matrix.pkl").open("wb") as file:
        pickle.dump(user_item_matrix_estimated, file)
    # Row/column labels, one per line, so the serving side can map
    # user_id/app back to matrix indices.
    with public_bucket.blob("rec_users.txt").open("w") as file:
        for user in user_item_matrix.index.tolist():
            file.write(f"{user}\n")
    with public_bucket.blob("rec_apps.txt").open("w") as file:
        for app in user_item_matrix.columns.tolist():
            file.write(f"{app}\n")

    return {"status": "Success"}

Get Top 5 New Apps Recommendation by User

In [ ]:
import functions_framework
import pandas as pd
import pickle
from google.cloud import bigquery, storage
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity

def get_arg(arg_name, request_json, request_args):
    """Fetch a request argument, preferring the JSON body over the query string.

    Args:
        arg_name: key to look up.
        request_json: parsed JSON body (may be None).
        request_args: query-string args mapping (may be None).
    Returns:
        The first value found, or None if the key is absent from both.
    """
    for source in (request_json, request_args):
        if source and arg_name in source:
            return source[arg_name]
    return None

@functions_framework.http
def main(request):
    """HTTP endpoint: top-5 unseen-app recommendations for one user.

    Expects a `user_id` argument (JSON body or query string). Loads the
    artifacts produced by the nightly batch job from Cloud Storage and
    returns the five highest-scoring apps the user has not used, plus the
    most-similar user as an explanation.

    Args:
        request: the incoming Flask request.
    Returns:
        (body, status, headers) tuple; body is JSON
        {"rec": [[app, score], ...], "explain": user_id or None}.
    """
    # Answer the CORS preflight FIRST. Originally this check ran last, so
    # every OPTIONS request paid for five GCS blob downloads and a full
    # scoring pass before returning an empty 204.
    if request.method == 'OPTIONS':
        # Allows GET requests from any origin with the Content-Type
        # header and caches the preflight response for 3600s.
        headers = {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'GET',
            'Access-Control-Allow-Headers': 'Content-Type',
            'Access-Control-Max-Age': '3600'
        }
        return ('', 204, headers)

    request_json = request.get_json(silent=True)
    request_args = request.args

    storage_client = storage.Client()
    data_bucket = storage_client.bucket('data604-project-g3-data-free')
    public_bucket = storage_client.bucket('data604-project-g3-public-free')

    # NOTE: pickle.load is acceptable only because these blobs are written
    # by our own batch job; never unpickle data from an untrusted source.
    with data_bucket.blob("ml/collaborative_filtering/data.pkl").open("rb") as file:
        df = pickle.load(file)
    with data_bucket.blob("ml/collaborative_filtering/explain.pkl").open("rb") as file:
        similarity_dict = pickle.load(file)
    with data_bucket.blob("ml/collaborative_filtering/user_item_matrix.pkl").open("rb") as file:
        user_item_matrix = pickle.load(file)
    with public_bucket.blob("rec_users.txt").open("r") as file:
        users = [row for row in file.read().split("\n") if len(row)]
    with public_bucket.blob("rec_apps.txt").open("r") as file:
        apps = [row for row in file.read().split("\n") if len(row)]

    user_id = get_arg('user_id', request_json, request_args)

    # Set membership is O(1); the original list made the unseen-apps filter
    # O(apps * apps_seen).
    apps_seen = set(df[df['user_id'] == user_id]['package_name'])
    unseen_apps = [app for app in apps if app not in apps_seen]

    # Explanation: the most similar user found by the batch job (None for
    # unknown users).
    most_similar_user_id = similarity_dict.get(user_id)

    if user_id in users:
        recommendations = user_item_matrix[users.index(user_id)]
        # Build the app -> column-index map once; the original called
        # apps.index(app) inside the loop (accidental O(n^2)).
        app_to_idx = {app: idx for idx, app in enumerate(apps)}
        score_list = [[app, recommendations[app_to_idx[app]]] for app in unseen_apps]
        top_n = sorted(score_list, key=lambda x: -x[1])
    else:
        top_n = []

    # Set CORS headers for the main request.
    headers = {
        'Access-Control-Allow-Origin': '*'
    }

    return (json.dumps({"rec": top_n[:5], "explain": most_similar_user_id}), 200, headers)