Skip to content

Commit

Permalink
csv to upload to S3 (#9486)
Browse files Browse the repository at this point in the history
* csv to upload to S3

* chore: wire in bsci script + split functions + add todo

* chore: wire in S3 fetch

* chore: configure upload/download to S3

* bug fixes

* add migrations

* uncomment S3 upload

* Uploads csv file to s3

* Reads from s3

Co-authored-by: Graham Dixon <[email protected]>
  • Loading branch information
thelostone-mc and gdixon authored Sep 22, 2021
1 parent 98cc666 commit f6f9f0d
Show file tree
Hide file tree
Showing 9 changed files with 254 additions and 44 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ RUN apt-get update

# Install general dependencies.
RUN apt-get install -y $PACKAGES
RUN apt-get update
RUN apt-get install -y $BUILD_DEPS
RUN apt-get update --fix-missing
RUN apt-get install -y $BUILD_DEPS --fix-missing

RUN apt-get install -y wget
RUN apt-get install -y libsodium-dev
Expand Down
2 changes: 2 additions & 0 deletions app/app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,8 @@ def callback(request):
S3_REPORT_BUCKET = env('S3_REPORT_BUCKET', default='') # TODO
S3_REPORT_PREFIX = env('S3_REPORT_PREFIX', default='') # TODO

# NOTE(review): this reads the S3_REPORT_BUCKET env var, so it always resolves
# to the same value as S3_REPORT_BUCKET above - confirm whether a dedicated
# S3_BSCI_SYBIL_BUCKET env var was intended.
S3_BSCI_SYBIL_BUCKET = env('S3_REPORT_BUCKET', default='') # TODO

INSTALLED_APPS += env.list('DEBUG_APPS', default=[])


Expand Down
27 changes: 25 additions & 2 deletions app/grants/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,24 @@
import math
import time
from decimal import Decimal
from io import StringIO

from django.conf import settings
from django.utils import timezone
from django.utils.text import slugify

import boto3
from app.services import RedisService
from celery import app
from celery.utils.log import get_task_logger
from dashboard.models import Profile
from grants.models import Grant, GrantCLR, GrantCollection, Subscription
from grants.utils import get_clr_rounds_metadata, save_grant_to_notion
from grants.utils import bsci_script, get_clr_rounds_metadata, save_grant_to_notion
from marketing.mails import (
new_contributions, new_grant, new_grant_admin, notion_failure_email, thank_you_for_supporting,
)
from townsquare.models import Comment
from perftools.models import StaticJsonEnv
from townsquare.models import Comment, SquelchProfile
from unidecode import unidecode

logger = get_task_logger(__name__)
Expand Down Expand Up @@ -421,3 +424,23 @@ def generate_collection_cache(self, collection_id):
collection.generate_cache()
except Exception as e:
print(e)


@app.shared_task(bind=True, max_retries=3)
def process_bsci_sybil_csv(self, file_name, csv):
    '''Load the BSCI sybil CSV and hand it to bsci_script.

    Args:
        file_name: S3 object key of the CSV. When falsy, the key is taken from
            the `csv_url` entry of the BSCI_SYBIL_TOKEN StaticJsonEnv record.
        csv: object whose read() returns the CSV content as UTF-8 bytes. When
            falsy, the object is downloaded from settings.S3_BSCI_SYBIL_BUCKET.
    '''

    if not file_name:
        # no key supplied: fall back to the one stored in StaticJsonEnv
        bsciJSON = StaticJsonEnv.objects.get(key='BSCI_SYBIL_TOKEN')
        data = bsciJSON.data
        file_name = data['csv_url']

    if not csv:
        # no content supplied: download the object body from S3
        client = boto3.client('s3', aws_access_key_id=settings.AWS_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
        csv_object = client.get_object(Bucket=settings.S3_BSCI_SYBIL_BUCKET, Key=file_name)
        csv = csv_object['Body']

    # NOTE(review): this conversion is applied to a caller-supplied csv as well
    # as to the S3 body, so both are assumed to expose read() returning bytes -
    # confirm against the callers.
    csv = StringIO(csv.read().decode('utf-8'))

    # run bsci script
    bsci_script(csv)
9 changes: 5 additions & 4 deletions app/grants/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
from django.urls import path, re_path

from grants.views import (
add_grant_from_collection, bulk_fund, bulk_grants_for_cart, cancel_grant_v1, cart_thumbnail, clr_grants, collage,
collection_thumbnail, contribute_to_grants_v1, contribution_addr_from_all_as_json,
add_grant_from_collection, api_toggle_user_sybil, bulk_fund, bulk_grants_for_cart, cancel_grant_v1, cart_thumbnail,
clr_grants, collage, collection_thumbnail, contribute_to_grants_v1, contribution_addr_from_all_as_json,
contribution_addr_from_grant_as_json, contribution_addr_from_grant_during_round_as_json,
contribution_addr_from_round_as_json, contribution_info_from_grant_during_round_as_json, create_matching_pledge_v1,
flag, get_clr_sybil_input, get_collection, get_collections_list, get_ethereum_cart_data, get_grant_payload,
Expand All @@ -30,7 +30,7 @@
grant_new, grants, grants_addr_as_json, grants_bulk_add, grants_by_grant_type, grants_cart_view, grants_info,
grants_landing, grants_type_redirect, ingest_contributions, ingest_contributions_view, invoice, leaderboard,
manage_ethereum_cart_data, new_matching_partner, profile, quickstart, remove_grant_from_collection, save_collection,
toggle_grant_favorite, toggle_user_sybil, verify_grant,
toggle_grant_favorite, upload_sybil_csv, verify_grant,
)

app_name = 'grants/'
Expand Down Expand Up @@ -116,6 +116,7 @@

# custom API
path('v1/api/get-clr-data/<int:round_id>', get_clr_sybil_input, name='get_clr_sybil_input'),
path('v1/api/toggle_user_sybil', toggle_user_sybil, name='toggle_user_sybil')
path('v1/api/toggle_user_sybil', api_toggle_user_sybil, name='api_toggle_user_sybil'),
path('v1/api/upload_sybil_csv', upload_sybil_csv, name='upload_sybil_csv')

]
137 changes: 136 additions & 1 deletion app/grants/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""
import logging
import math
import os
import re
import urllib.request
Expand All @@ -28,6 +29,8 @@

from django.utils import timezone

import numpy as np
import pandas as pd
from app.settings import BASE_URL, MEDIA_URL, NOTION_API_KEY, NOTION_SYBIL_DB
from app.utils import notion_write
from avatar.utils import convert_img
Expand All @@ -41,8 +44,9 @@
from grants.sync.rsk import sync_rsk_payout
from grants.sync.zcash import sync_zcash_payout
from grants.sync.zil import sync_zil_payout
from perftools.models import JSONStore, StaticJsonEnv
from perftools.models import StaticJsonEnv
from PIL import Image, ImageDraw, ImageOps
from townsquare.models import SquelchProfile

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -308,3 +312,134 @@ def save_grant_to_notion(grant):
}]
}
})


def toggle_user_sybil(sybil_users, non_sybil_users):
    '''Mark the given users as sybil and clear the sybil mark from others.

    Args:
        sybil_users: iterable of dicts with `handle`, `label` and `comment`
            keys. Each matching profile gets a SquelchProfile row unless it
            already has one. May be empty or None.
        non_sybil_users: iterable of dicts identifying a profile by `id`
            (primary key) or, when `id` is absent, by `handle`. Every
            SquelchProfile row of a matching profile is deleted. May be empty
            or None.

    Failures are logged per user and never propagate, so one bad record does
    not abort the rest of the batch.
    '''

    from dashboard.models import Profile

    squelched_profiles = SquelchProfile.objects.all()

    # mark users as sybil
    for user in sybil_users or []:
        handle = user.get('handle')
        try:
            profile = Profile.objects.filter(handle=handle).first()
            if not profile:
                logger.error(f"error: profile not found for {handle} as sybil.")
                continue

            label = user.get('label')
            comment = user.get('comment')

            # a NaN comment (the missing-value marker in records built from a
            # CSV) is truthy, so swap it for a readable default
            if comment and isNaN(comment):
                comment = 'added by bsci'

            # only create an entry when the record is complete and the
            # profile is not already squelched
            if label and comment and not squelched_profiles.filter(profile=profile).exists():
                SquelchProfile.objects.create(
                    profile=profile,
                    label=label,
                    comments=comment
                )
        except Exception as e:
            logger.error(f"error: unable to mark user {handle} as sybil. {e}")

    # remove the sybil mark
    for user in non_sybil_users or []:
        user_id = user.get('id')
        handle = user.get('handle')
        identifier = user_id if user_id is not None else handle
        try:
            # records produced by bsci_script carry a handle but no id, so
            # fall back to the handle when no primary key is given
            if user_id is not None:
                profile = Profile.objects.get(pk=user_id)
            else:
                profile = Profile.objects.filter(handle=handle).first()

            if not profile:
                logger.error(f"error: profile not found for {identifier} as non sybil.")
                continue

            squelched_profiles.filter(profile=profile).delete()
        except Exception as e:
            logger.error(f"error: unable to mark {identifier} as non sybil. {e}")



def _split_bsci_bucket(endpoint_df, mask, flag_column):
    '''Pull the rows selected by `mask` out of `endpoint_df`.

    Returns a `(bucket, remaining)` pair. `bucket` holds the selected rows
    reduced to `handle`, `label` (taken from `flag_column`) and `comment`
    (taken from `notes`). `remaining` is `endpoint_df` without any row whose
    handle appears in the bucket.
    '''
    selected = endpoint_df[mask]
    remaining = endpoint_df[~endpoint_df.handle.isin(selected.handle)]
    # rename() without inplace returns the renamed frame; with inplace=True it
    # returns None, which must not be assigned back
    bucket = selected[['handle', flag_column, 'notes']].rename(
        columns={flag_column: 'label', 'notes': 'comment'}
    )
    return bucket, remaining


def bsci_script(csv):
    '''Classify the users of a BSCI CSV as sybil / non sybil and apply it.

    Args:
        csv: anything `pandas.read_csv` accepts (path or file-like object).
            The filters below read the columns `handle`, `notes`,
            `flag_type_x`, `flag_type_y`, `ml_score`, `is_sybil_y` and
            `reviewer_is_certain (0/1)_y`.

    Rows are assigned to the first bucket they match, in the order below, and
    every handle placed in a bucket is removed before the next filter runs.
    The resulting records are passed to `toggle_user_sybil`. Any failure is
    logged and swallowed so the caller is never interrupted.
    '''
    try:
        endpoint_df = pd.read_csv(csv)

        # human labelled sybils. The 'reviewer_is_certain (0/1)_y' and
        # 'is_sybil_y' thresholds can be adjusted.
        human_sybil, endpoint_df = _split_bsci_bucket(
            endpoint_df,
            (endpoint_df['flag_type_y'] == 'Human')
            & (endpoint_df['reviewer_is_certain (0/1)_y'] >= 0.99)
            & (endpoint_df['is_sybil_y'] >= 0.99),
            'flag_type_y',
        )

        # heuristic labelled sybils, nothing to adjust here
        heuristic_sybil, endpoint_df = _split_bsci_bucket(
            endpoint_df,
            (endpoint_df['flag_type_x'] == 'Heuristic') & (endpoint_df['ml_score'] >= 0.99),
            'flag_type_x',
        )

        # ml predicted sybils. A higher ml_score threshold means fewer appeals
        # but some sybils may slip through; a lower one catches more sybils
        # but is likely to cause more appeals.
        ml_sybil, endpoint_df = _split_bsci_bucket(
            endpoint_df,
            (endpoint_df['flag_type_x'] == 'Prediction') & (endpoint_df['ml_score'] >= 0.9),
            'flag_type_x',
        )

        # human labelled users that were not caught by the sybil filters.
        # notna() is required: comparing with `!= np.nan` is always True.
        human_non_sybil, endpoint_df = _split_bsci_bucket(
            endpoint_df,
            (endpoint_df['flag_type_y'] == 'Human')
            & endpoint_df['reviewer_is_certain (0/1)_y'].notna(),
            'flag_type_y',
        )

        # heuristic non sybils, nothing to adjust here
        heuristic_non_sybil, endpoint_df = _split_bsci_bucket(
            endpoint_df,
            (endpoint_df['flag_type_x'] == 'Heuristic') & (endpoint_df['ml_score'] <= 0.01),
            'flag_type_x',
        )

        # every user left over after the previous filters is a non sybil
        ml_non_sybil = endpoint_df[['handle', 'flag_type_x', 'notes']].rename(
            columns={'flag_type_x': 'label', 'notes': 'comment'}
        )

        # DataFrame.append no longer exists in pandas 2.x; concat is the
        # supported way to stack the buckets
        sybil_df = pd.concat([human_sybil, heuristic_sybil, ml_sybil])
        non_sybil_df = pd.concat([human_non_sybil, heuristic_non_sybil, ml_non_sybil])

        # convert to the list-of-dicts shape toggle_user_sybil expects
        sybil_users = sybil_df.to_dict('records')
        non_sybil_users = non_sybil_df.to_dict('records')

        toggle_user_sybil(sybil_users, non_sybil_users)

    except Exception as e:
        # best effort: keep the traceback in the log but never raise
        logger.exception(f'error: bsci_sybil_script - {e}')

def isNaN(string):
    '''Tell whether `string` compares unequal to itself, as a float NaN does.'''
    differs_from_itself = string != string
    return differs_from_itself
Loading

0 comments on commit f6f9f0d

Please sign in to comment.