csv to upload to S3 (#9486)

* csv to upload to S3
* chore: wire in bsci script + split functions + add todo
* chore: wire in S3 fetch
* chore: configure upload/download to S3
* bug fixes
* add migrations
* uncomment S3 upload
* Uploads csv file to s3
* Reads from s3

Co-authored-by: Graham Dixon <[email protected]>
gitcoinco · Sep 22, 2021 · f6f9f0d · f6f9f0d
1 parent 98cc666
commit f6f9f0d
Show file tree

Hide file tree

Showing 9 changed files with 254 additions and 44 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -16,8 +16,8 @@ RUN apt-get update
 
 # Install general dependencies.
 RUN apt-get install -y $PACKAGES
-RUN apt-get update
-RUN apt-get install -y $BUILD_DEPS
+RUN apt-get update --fix-missing
+RUN apt-get install -y $BUILD_DEPS --fix-missing
 
 RUN apt-get install -y wget
 RUN apt-get install -y libsodium-dev

diff --git a/app/app/settings.py b/app/app/settings.py
@@ -771,6 +771,8 @@ def callback(request):
 S3_REPORT_BUCKET = env('S3_REPORT_BUCKET', default='')  # TODO
 S3_REPORT_PREFIX = env('S3_REPORT_PREFIX', default='')  # TODO
 
+# Bucket holding the BSCI sybil CSVs. Reads its own environment variable and
+# falls back to the report bucket when no dedicated bucket is configured.
+S3_BSCI_SYBIL_BUCKET = env('S3_BSCI_SYBIL_BUCKET', default=S3_REPORT_BUCKET)  # TODO
+
 INSTALLED_APPS += env.list('DEBUG_APPS', default=[])
 
 

diff --git a/app/grants/tasks.py b/app/grants/tasks.py
@@ -2,21 +2,24 @@
 import math
 import time
 from decimal import Decimal
+from io import StringIO
 
 from django.conf import settings
 from django.utils import timezone
 from django.utils.text import slugify
 
+import boto3
 from app.services import RedisService
 from celery import app
 from celery.utils.log import get_task_logger
 from dashboard.models import Profile
 from grants.models import Grant, GrantCLR, GrantCollection, Subscription
-from grants.utils import get_clr_rounds_metadata, save_grant_to_notion
+from grants.utils import bsci_script, get_clr_rounds_metadata, save_grant_to_notion
 from marketing.mails import (
     new_contributions, new_grant, new_grant_admin, notion_failure_email, thank_you_for_supporting,
 )
-from townsquare.models import Comment
+from perftools.models import StaticJsonEnv
+from townsquare.models import Comment, SquelchProfile
 from unidecode import unidecode
 
 logger = get_task_logger(__name__)
@@ -421,3 +424,23 @@ def generate_collection_cache(self, collection_id):
         collection.generate_cache()
     except Exception as e:
         print(e)
+
+
+@app.shared_task(bind=True, max_retries=3)
+def process_bsci_sybil_csv(self, file_name, csv):
+    '''Load a BSCI sybil CSV and run bsci_script on it.
+
+    Args:
+        file_name: S3 object key of the CSV. When falsy (and no csv is given),
+            the key is read from the 'csv_url' entry of the 'BSCI_SYBIL_TOKEN'
+            StaticJsonEnv record.
+        csv: optional CSV payload - a file-like object, bytes or str. When
+            falsy, the object is downloaded from settings.S3_BSCI_SYBIL_BUCKET.
+
+    Raises:
+        Whatever the StaticJsonEnv lookup, the S3 client or the utf-8 decode
+        raise; nothing is caught here.
+    '''
+
+    if not csv:
+        if not file_name:
+            # only query the stored key when a download is actually needed
+            bsciJSON = StaticJsonEnv.objects.get(key='BSCI_SYBIL_TOKEN')
+            file_name = bsciJSON.data['csv_url']
+
+        client = boto3.client('s3', aws_access_key_id=settings.AWS_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
+        csv_object = client.get_object(Bucket=settings.S3_BSCI_SYBIL_BUCKET, Key=file_name)
+        csv = csv_object['Body']
+
+    # accept streams as well as payloads that were already read by the caller
+    payload = csv.read() if hasattr(csv, 'read') else csv
+    if isinstance(payload, (bytes, bytearray)):
+        payload = payload.decode('utf-8')
+
+    # run bsci script
+    bsci_script(StringIO(payload))
diff --git a/app/grants/urls.py b/app/grants/urls.py
@@ -20,8 +20,8 @@
 from django.urls import path, re_path
 
 from grants.views import (
-    add_grant_from_collection, bulk_fund, bulk_grants_for_cart, cancel_grant_v1, cart_thumbnail, clr_grants, collage,
-    collection_thumbnail, contribute_to_grants_v1, contribution_addr_from_all_as_json,
+    add_grant_from_collection, api_toggle_user_sybil, bulk_fund, bulk_grants_for_cart, cancel_grant_v1, cart_thumbnail,
+    clr_grants, collage, collection_thumbnail, contribute_to_grants_v1, contribution_addr_from_all_as_json,
     contribution_addr_from_grant_as_json, contribution_addr_from_grant_during_round_as_json,
     contribution_addr_from_round_as_json, contribution_info_from_grant_during_round_as_json, create_matching_pledge_v1,
     flag, get_clr_sybil_input, get_collection, get_collections_list, get_ethereum_cart_data, get_grant_payload,
@@ -30,7 +30,7 @@
     grant_new, grants, grants_addr_as_json, grants_bulk_add, grants_by_grant_type, grants_cart_view, grants_info,
     grants_landing, grants_type_redirect, ingest_contributions, ingest_contributions_view, invoice, leaderboard,
     manage_ethereum_cart_data, new_matching_partner, profile, quickstart, remove_grant_from_collection, save_collection,
-    toggle_grant_favorite, toggle_user_sybil, verify_grant,
+    toggle_grant_favorite, upload_sybil_csv, verify_grant,
 )
 
 app_name = 'grants/'
@@ -116,6 +116,7 @@
 
     # custom API
     path('v1/api/get-clr-data/<int:round_id>', get_clr_sybil_input, name='get_clr_sybil_input'),
-    path('v1/api/toggle_user_sybil', toggle_user_sybil, name='toggle_user_sybil')
+    path('v1/api/toggle_user_sybil', api_toggle_user_sybil, name='api_toggle_user_sybil'),
+    path('v1/api/upload_sybil_csv', upload_sybil_csv, name='upload_sybil_csv')
 
 ]
diff --git a/app/grants/utils.py b/app/grants/utils.py
@@ -18,6 +18,7 @@
 
 """
 import logging
+import math
 import os
 import re
 import urllib.request
@@ -28,6 +29,8 @@
 
 from django.utils import timezone
 
+import numpy as np
+import pandas as pd
 from app.settings import BASE_URL, MEDIA_URL, NOTION_API_KEY, NOTION_SYBIL_DB
 from app.utils import notion_write
 from avatar.utils import convert_img
@@ -41,8 +44,9 @@
 from grants.sync.rsk import sync_rsk_payout
 from grants.sync.zcash import sync_zcash_payout
 from grants.sync.zil import sync_zil_payout
-from perftools.models import JSONStore, StaticJsonEnv
+from perftools.models import StaticJsonEnv
 from PIL import Image, ImageDraw, ImageOps
+from townsquare.models import SquelchProfile
 
 logger = logging.getLogger(__name__)
 
@@ -308,3 +312,134 @@ def save_grant_to_notion(grant):
                 }]
             }
         })
+
+
+def toggle_user_sybil(sybil_users, non_sybil_users):
+    '''Mark users as sybil and clear the sybil mark from others.
+
+    Args:
+        sybil_users: iterable of dicts with 'handle', 'label' and 'comment'
+            keys (may be None/empty). A SquelchProfile is created for each
+            handle that resolves to a Profile, has a label and a comment, and
+            has no SquelchProfile yet.
+        non_sybil_users: iterable of dicts identifying a profile by 'id' or,
+            when no id is present, by 'handle' (may be None/empty). Every
+            SquelchProfile of a resolved profile is deleted.
+
+    Failures for an individual user are logged and do not stop processing.
+    '''
+
+    # imported here rather than at module level, as in the original code
+    from dashboard.models import Profile
+
+    squelched_profiles = SquelchProfile.objects.all()
+
+    # iterate through users which need to be marked as sybil
+    for user in sybil_users or []:
+        handle = user.get('handle')
+        try:
+            profile = Profile.objects.filter(handle=handle).first()
+            if not profile:
+                logger.error(f"error: profile not found for {handle} as sybil.")
+                continue
+
+            label = user.get('label')
+            comment = user.get('comment')
+
+            # a missing comment arrives as NaN; give it a default text
+            if comment and isNaN(comment):
+                comment = 'added by bsci'
+
+            # only create an entry when the user has none in SquelchProfile yet
+            if label and comment and not squelched_profiles.filter(profile=profile).exists():
+                SquelchProfile.objects.create(
+                    profile=profile,
+                    label=label,
+                    comments=comment
+                )
+        except Exception as e:
+            logger.error(f"error: unable to mark user {handle} as sybil. {e}")
+
+    # iterate and remove sybil from user
+    for user in non_sybil_users or []:
+        user_id = user.get('id')
+        handle = user.get('handle')
+        identifier = user_id if user_id is not None else handle
+        try:
+            if user_id is not None:
+                profile = Profile.objects.get(pk=user_id)
+            else:
+                # records produced by bsci_script carry a handle, not an id
+                profile = Profile.objects.filter(handle=handle).first()
+
+            if not profile:
+                logger.error(f"error: profile not found for {identifier} as non sybil.")
+                continue
+
+            squelched_profiles.filter(profile=profile).delete()
+        except Exception as e:
+            logger.error(f"error: unable to mark {identifier} as non sybil. {e}")
+
+
+
+def _split_flagged(frame, mask, label_column):
+    '''Pull the rows selected by mask out of frame.
+
+    Returns a (flagged, remaining) tuple. flagged holds the columns 'handle',
+    'label' (taken from label_column) and 'comment' (taken from 'notes').
+    remaining is frame without any row whose handle was flagged.
+    '''
+    flagged = frame[mask]
+    remaining = frame[~frame.handle.isin(flagged.handle)]
+    # rename returns a new frame; inplace=True would return None instead
+    flagged = flagged[['handle', label_column, 'notes']].rename(
+        columns={label_column: 'label', 'notes': 'comment'}
+    )
+    return flagged, remaining
+
+
+def bsci_script(csv):
+    '''Classify the users of a BSCI CSV and pass them to toggle_user_sybil.
+
+    Args:
+        csv: path or file-like object accepted by pandas.read_csv. The columns
+            'handle', 'notes', 'flag_type_x', 'flag_type_y', 'ml_score',
+            'is_sybil_y' and 'reviewer_is_certain (0/1)_y' are read.
+
+    Groups are extracted in order and every handle placed in a group is removed
+    from the pool before the next group is evaluated. Any exception is logged
+    and swallowed; nothing is returned.
+    '''
+    try:
+        # choose the specific csv you want to use
+        endpoint_df = pd.read_csv(csv)
+
+        sybil_frames = []
+        non_sybil_frames = []
+
+        # human labeled sybils ('reviewer_is_certain (0/1)' and 'is_sybil_y' values can be adjusted)
+        # human_sybil_score could also be used as a filter if wanted
+        mask = (
+            (endpoint_df['flag_type_y'] == 'Human') &
+            (endpoint_df['reviewer_is_certain (0/1)_y'] >= 0.99) &
+            (endpoint_df['is_sybil_y'] >= 0.99)
+        )
+        human_sybil, endpoint_df = _split_flagged(endpoint_df, mask, 'flag_type_y')
+        sybil_frames.append(human_sybil)
+
+        # heuristic labeled sybils, nothing can be adjusted here
+        mask = (endpoint_df['flag_type_x'] == 'Heuristic') & (endpoint_df['ml_score'] >= 0.99)
+        heuristic_sybil, endpoint_df = _split_flagged(endpoint_df, mask, 'flag_type_x')
+        sybil_frames.append(heuristic_sybil)
+
+        # ml predicted sybils, ml_score can be adjusted to be either higher or lower
+        # higher ml_score means less people are likely to appeal, but potentially some sybils slip through
+        # lower ml_score means more people are likely to appeal, but more sybils are potentially caught
+        mask = (endpoint_df['flag_type_x'] == 'Prediction') & (endpoint_df['ml_score'] >= 0.9)
+        ml_sybil, endpoint_df = _split_flagged(endpoint_df, mask, 'flag_type_x')
+        sybil_frames.append(ml_sybil)
+
+        # human labeled non-sybil users: the remaining human-reviewed rows
+        # notna() is required here; comparing against np.nan with != is always True
+        mask = (endpoint_df['flag_type_y'] == 'Human') & endpoint_df['reviewer_is_certain (0/1)_y'].notna()
+        human_non_sybil, endpoint_df = _split_flagged(endpoint_df, mask, 'flag_type_y')
+        non_sybil_frames.append(human_non_sybil)
+
+        # heuristic non sybils, nothing here needs to be adjusted
+        mask = (endpoint_df['flag_type_x'] == 'Heuristic') & (endpoint_df['ml_score'] <= 0.01)
+        heuristic_non_sybil, endpoint_df = _split_flagged(endpoint_df, mask, 'flag_type_x')
+        non_sybil_frames.append(heuristic_non_sybil)
+
+        # the remaining users that were not picked up by any previous section
+        ml_non_sybil = endpoint_df[['handle', 'flag_type_x', 'notes']].rename(
+            columns={'flag_type_x': 'label', 'notes': 'comment'}
+        )
+        non_sybil_frames.append(ml_non_sybil)
+
+        # conversion of all the data so that it can be passed to toggle_user_sybil
+        sybil_users = pd.concat(sybil_frames).to_dict('records')
+        non_sybil_users = pd.concat(non_sybil_frames).to_dict('records')
+
+        toggle_user_sybil(sybil_users, non_sybil_users)
+
+    except Exception as e:
+        logger.error(f'error: bsci_sybil_script - {e}')
+
+def isNaN(string):
+    '''Return whether the value compares unequal to itself, as a float NaN does.'''
+    differs_from_itself = string != string
+    return differs_from_itself