akeys = ['SACRED', 'Weights and Biases', 'ML Flow', 'Comet ML', 'Neptune.ai',
         'Valohai', 'Guild AI', 'Spell.ML', 'DVC', 'CML', 'Yacs', 'Atlas', 
         'Sirio ML', 'Verta AI', 'Pachyderm', 'MLeap', 'Soda', 'Kubeflow', 
         'ClearML', 'FlexP', 'ML Pipeline', 'CNVRG', 'Jovian ML', 'Datajoint', 
         'Studio', 'Datmo', 'Lore', 'FORGE', 'Sumatra', 'RandOpt', 'feature Forge',
         'ModelChimp', 'PolyAxon', 'Optuna', 'ML Commons', 'OctoML', 'Algorithmia', 
         'Flyte', 'MetaFlow', 'Datarobot', 'Dataiku', 'Allegro AI', 'Iguazio', 'DBT',
         'sisyphus', 'Seldon']
fkeys = ['Abacus AI','Aible','Cubonacci','Databricks','Datagrok','Dataiku','DataRobot',
         'Dessa','dotData','Elementl','H2O','HIVE','Iguazio','kedro','Michelangelo','MLFlow',
         'Obliviously AI','Peltarion','Petuum','Picsell.ia','Polyaxon','Robust AI',
         'Snorkel AI','Stradigi AI','Supervisely','Tecton','Xpanse AI','Aircloak',
         'Alluxio','Alteryx','Amazon Redshift','Amundsen','Anodot','Apache Druid',
         'Apache Hudi','Apache ORC','Aparavi','AresDB','Ascend.io','AtScale','Cazena',
         'ClearSky Data','Cohesity','Confluent','cuDF','Dask','Datatable','Dataturks',
         'Datera','DefinedCrowd','Delta Lake','Doccano','Dolt','Dremio','DVC - Iterative.ai',
         'Elastifile','erwin','Excelero','Facets','FEAST','Figure Eight','Fluree','Gemini Data',
         'Git LFS','Gluent','Graviti','Gretel AI','Hammerspace','Heartex Label Studio','Igneous',
         'iMerit','Imply','Incorta','Kimono Labs','Koalas','Komprise','Kyvos Insights','Labelbox','LabelImg',
         'Materialize','Milvus','Modin','Naveego','Octopai','Pachyderm','Parquet','Pilosa','Playment',
         'Presto','Prodigy','Prometheus','Qri','Quilt Data','Quobyte','Rockset','Rubrik','Scale AI',
         'Scrapinghub','Segments.ai','Sisu','Snorkel','Spark','SQLFlow','Starburst Data','Storbyte',
         'Superb AI','Synthetaic','Tamr','TerminusDB','Tumult Labs','V7Labs','Vaex','Vearch','Vexata',
         'Voxel51 // Scoop','Waterline Data','Yellowbrick Data','Zilliz','Blaize','Boulder AI','BrainChip',
         'Cambricon','Cerebras','EdgeQ','Graphcore','GreenWaves Technologies','Groq','Habana Labs','Hailo',
         'Kneron','LeapMind','Lightelligence','Luminous Computing','Mythic','Nuvia','Prophesee','SambaNova',
         'SiMa.ai','Syntiant','Wave Computing','Zero ASIC','Airflow','Anyscale','Backend AI','Cadence',
         'Cloudera','Datadog','Domino Data Lab','FloydHub','Flyte','HYCU','Luigi','Metaflow','Paperspace',
         'Prefect','Valohai','Accord','AIMET','Alectio','Alink','Allegro AI/TRAINS','AllenNLP','Angel ML',
         'Apache Mahout','Apache MXNet','BentoML','Boruta','Caffe','Catalyst','Chainer','CleverHans','Colab',
         'Comet','DAGsHub','DarwinAI','Datmo','DAWNBench','DeepNote','Determined AI','Dialogflow','Dockship',
         'euler','explainX.ai','fastText','Featuretools','FedAI (FATE)','Fiddler Labs','Fiddler Labs','flair',
         'Gensim','GluonCV','Grid AI','Horovod','Hugging Face','HyperOpt','InterpretML','JAX','Katib','Kyndi',
         'LightGBM','LIME','Lucid','Ludwig','Matroid','Mindspore','ML.NET','MLlib','MLPerf','NeMo','Neptune',
         'Netron','nteract','OpenSeq2Seq','Paddle','papermill','PerceptiLabs','PlaidML','Pyro','PySyft',
         'Pythia','PyTorch','PyTorch Lightning','Rasa','Ray','Replicate','River','scikit-learn','scribble Data',
         'SHAP','SigOpt','spaCy','Spell','Streamlit','talos','Tazi.ai','Tensorboard','TensorFlow','Theano',
         'TPOT','TransmogrifAI','Truera','Truera','tsfresh','Tune','Turi Create','Vowpal Wabbit','Weights & Biases',
         'XGBoost','Algorithmia','Apache Flink','Apache Kafka','Apache TVM','Argo','Arize AI','Arthur AI','Clipper',
         'Core ML','Cortex','Dash','Datatron','Deeplite','Evidently AI','Formant','Fritz AI','Gradio','Inferrd',
         'Kubeflow','Losswise','ML Kit','MMdnn','MNN','Mona Labs','ncnn','Neural Network Distiller','OctoML',
         'ONNX','Plotly','PredictionIO','RelicX','Seldon','superwise.ai','TensorFlow Extended','TensorFlow Lite',
         'TensorRT','Unravel Data','VertaAI','Xnor.ai']

Possibly useful link: PyDriller is a project for analyzing the code in Git repos. https://github.com/ishepard/pydriller
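A minimal sketch of what that could look like here, assuming PyDriller 2.x's Repository API and a hypothetical local clone at ./dvc (a remote URL should also work):

# Sketch only: assumes PyDriller >= 2.0 and a hypothetical local clone at ./dvc.
from collections import Counter
from pydriller import Repository

commits_per_author = Counter()
for commit in Repository('./dvc').traverse_commits():
    commits_per_author[commit.author.name] += 1

print(commits_per_author.most_common(5))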

from difflib import SequenceMatcher

def similar(a, b):
    """Similarity ratio between strings a and b via difflib's SequenceMatcher (0.0 to 1.0)."""
    return SequenceMatcher(None, a, b).ratio()

# score every akey against every fkey
lds = []
for akey in akeys:
    tmp = []
    for fkey in fkeys:
        tmp.append({
            'aval': akey,
            'fval': fkey,
            'score': similar(a=akey, b=fkey)
        })
    lds.append(tmp)
    
    
def top_k(listdict, key_name, k, direction):
    """Returns the top k rows sorted on key_name: 'gtl' = greatest to least, 'ltg' = least to greatest."""
    
    if direction == 'ltg':
        s = sorted(listdict, key=lambda x: x[key_name], reverse=False)
    elif direction == 'gtl':
        s = sorted(listdict, key=lambda x: x[key_name], reverse=True)
    else:
        raise ValueError("direction must be 'ltg' or 'gtl'")
        
    return s[:k]
    

k = 3
out = []
for ld in lds:
    out.append({
        'key': ld[0]['aval'],
        'matches': top_k(listdict=ld, key_name='score', k=k, direction='gtl')
    })

thresh = 0.65
filtered = []
for obj in out:
    if obj['matches'][0]['score'] < thresh:
        continue
    filtered.append({
        'key': obj['key'],
        'match': obj['matches'][0]
    })

print('matches:', len(filtered), 'out of: ', len(akeys))
len(filtered) / len(akeys)*1.0
matches: 22 out of:  46
0.4782608695652174

import pandas as pd
pd.DataFrame([{'key': fil['key'], 'match': fil['match']['fval']} for fil in filtered]).to_csv('corr1.csv')

import os
import pandas as pd
from ghapi.core import GhApi, HTTP403ForbiddenError
import ghapi
from urllib.parse import urlparse
import time
from pathlib import Path
from datetime import datetime, timedelta

linkedin_url = 'https://www.linkedin.com/company/iterative-ai/'
github_url = 'https://github.com/iterative/dvc'
company_url = 'https://dvc.org/'

class GithubInfo:
    def __init__(self, ghapi_client, url, cachedir='gh_data'):
        self.url = url
        self.cachedir = Path(os.getcwd())/cachedir
        self.cachedir.mkdir(parents=True, exist_ok=True)
        self.ghapi_client = ghapi_client
        
        comps = self.parse_gh_url()
        self.owner = comps['owner']
        self.repo = comps['repo']
        
        self.alerts = self.get_alerts()
        self.get_all_prs()
        self.prs.index = self.prs['id']
        self.pr_analysis = {}
        self.analyze_prs()
    
        
    def parse_gh_url(self):
        comps = urlparse(self.url)
        return {
            'owner':comps.path.split('/')[1],
            'repo': comps.path.split('/')[2]
        }


    def get_alerts(self):
        try: 
            return self.ghapi_client.code_scanning.list_alerts_for_repo(owner=self.owner, repo=self.repo)
        except ghapi.core.HTTP403ForbiddenError:
            return 'Advanced security disabled.'
        except ghapi.core.HTTP404NotFoundError:
            return 'The ref does not match an existing ref'
        except ghapi.core.HTTP5xxServerError:
            return 'Service unavailable'


    def make_cache_fp(self,object_type):
        today = datetime.today().strftime('%Y-%m-%d')
        fn = '_'.join([self.owner,self.repo,today])+'.csv.gz'
        return self.cachedir/object_type/fn


    def download_prs(self):
        prs = []
        page = 1  # GitHub pagination is 1-indexed; starting at 0 repeats the first page
        print('downloading prs for', self.owner + '/' + self.repo)
        while True:
            try:
                new_page = self.ghapi_client.pulls.list(
                    owner=self.owner, repo=self.repo, state='all', per_page=100, page=page
                    )
            except Exception:
                break
                
            if len(new_page) == 0:
                break
        
            prs += new_page
            page += 1
            time.sleep(1)
        
        return prs
    
    
    def get_all_prs(self):        
        cache_fp = self.make_cache_fp(object_type='prs')
        if cache_fp.exists():
            print('using existing prs file.', cache_fp)
            self.prs = pd.read_csv(cache_fp, compression='gzip')
            return
        
        cache_fp.parent.mkdir(parents=True, exist_ok=True)
        print('getting prs, storing in', cache_fp)
        # TODO: Get yesterday's file and find out which rows to get.
        self.prs = pd.DataFrame(
            data=self.download_prs()
        )
        self.prs.to_csv(cache_fp, index=False, compression='gzip')
        
        return 
    
    
    def analyze_prs(self):
        first_pr = self.return_first_pr()
        self.pr_analysis['first_pr_created_at'] = first_pr['created_at']
        self.pr_analysis['first_pr_merged_at'] = first_pr['merged_at']
        
        
    
    def return_first_pr(self):
        tmp = self.prs.sort_values('created_at')
        out = tmp.iloc[0]
        del tmp
        return out

gh_client = GhApi()
GH = GithubInfo(ghapi_client=gh_client, url=github_url)
using existing file. /ws/forks/prcvd/nbs/gh_data/prs/iterative_dvc_2021-01-14.csv.gz

GH.pr_analysis
{'first_pr_created_at': '2017-04-07T23:33:45Z',
 'first_pr_merged_at': '2017-04-07T23:40:57Z'}

gh_client.rate_limit.get()
  • resources:
    • core:
      • limit: 60
      • remaining: 0
      • reset: 1610665925
      • used: 60
    • graphql:
      • limit: 0
      • remaining: 0
      • reset: 1610667301
      • used: 0
    • integration_manifest:
      • limit: 5000
      • remaining: 5000
      • reset: 1610667301
      • used: 0
    • search:
      • limit: 10
      • remaining: 10
      • reset: 1610663761
      • used: 0
  • rate:
    • limit: 60
    • remaining: 0
    • reset: 1610665925
    • used: 60
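The exhausted core limit above (60 requests/hour) is the unauthenticated rate. A minimal sketch of constructing the client with a personal access token instead, assuming the token is exported as GITHUB_TOKEN (GhApi accepts a token argument):

import os
from ghapi.core import GhApi

# Sketch: pass a personal access token (assumed to live in GITHUB_TOKEN) so that
# requests count against the much higher authenticated rate limit.
gh_client = GhApi(token=os.environ.get('GITHUB_TOKEN'))
GH = GithubInfo(ghapi_client=gh_client, url=github_url)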
"""
Ideas:
    - detect anomalies on github project deliveries (missed deadlines, etc..)
    
    
Versions
    -
    
PR Overall question:
    - First PR merged on date DONE
    - Average days between PRs open
    - Average days between PRs merged
    
PR overall questions per week:
    - PRs(week)
    - Devs(week)
    - 
    
Per PR questions:
    - average time per PR
    - average size per PR
    - average contributors per PR

Issues overall per week:
    - new issues
    - resolved issues
    

"""

pr_df = pd.DataFrame(GH.prs).sort_values('created_at')

# seconds elapsed between consecutive merged PRs
t1 = pd.DataFrame(pd.to_datetime(pr_df['merged_at'])).sort_values('merged_at')
t1['shift1'] = t1['merged_at'].shift(1)
t1.dropna(how='any', inplace=True)

t1['diff'] = t1['merged_at'] - t1['shift1']
t1['diff'] = t1['diff'].apply(lambda x: x.total_seconds())

t1
      merged_at                  shift1                     diff
2710  2017-04-08 00:43:05+00:00  2017-04-07 23:40:57+00:00    3728.0
2709  2017-04-10 19:27:00+00:00  2017-04-08 00:43:05+00:00  240235.0
2708  2017-04-12 02:40:13+00:00  2017-04-10 19:27:00+00:00  112393.0
2707  2017-04-12 05:15:38+00:00  2017-04-12 02:40:13+00:00    9325.0
2706  2017-04-18 00:08:39+00:00  2017-04-12 05:15:38+00:00  499981.0
...                         ...                        ...       ...
60    2021-01-14 15:23:21+00:00  2021-01-14 15:23:21+00:00       0.0
1     2021-01-14 18:07:24+00:00  2021-01-14 15:23:21+00:00    9843.0
101   2021-01-14 18:07:24+00:00  2021-01-14 18:07:24+00:00       0.0
2     2021-01-14 19:10:54+00:00  2021-01-14 18:07:24+00:00    3810.0
102   2021-01-14 19:10:54+00:00  2021-01-14 19:10:54+00:00       0.0

2405 rows × 3 columns

type(t1.loc[0]['shift1'])
NoneType

def compute_mtb(df, feature, stat):
    """Computes a summary statistic (mean/min/max/sd) of the time between consecutive values of feature, in seconds."""
    tmp = pd.DataFrame({feature: pd.to_datetime(df[feature])}).sort_values(feature)
    tmp[feature+'_shift1'] = tmp[feature].shift(1)
    tmp[feature+'_diff'] = tmp[feature] - tmp[feature+'_shift1']
    if stat == 'mean':
        return tmp[feature+'_diff'].mean().total_seconds()
    
    elif stat == 'min':
        return tmp[feature+'_diff'].min().total_seconds()
    
    elif stat == 'max':
        return tmp[feature+'_diff'].max().total_seconds()
    
    elif stat == 'sd':
        return tmp[feature+'_diff'].std().total_seconds()
    
    

compute_mtb(df=GH.prs, feature='merged_at', stat='mean')
48002.45843

pr_df.iloc[0]
url                    https://api.github.com/repos/iterative/dvc/pul...
id                                                             114918902
node_id                                 MDExOlB1bGxSZXF1ZXN0MTE0OTE4OTAy
html_url                        https://github.com/iterative/dvc/pull/12
diff_url                   https://github.com/iterative/dvc/pull/12.diff
patch_url                 https://github.com/iterative/dvc/pull/12.patch
issue_url              https://api.github.com/repos/iterative/dvc/iss...
number                                                                12
state                                                             closed
locked                                                             False
title                                      requirements / osx executable
user                   {'login': 'earlh', 'id': 194240, 'node_id': 'M...
body                   add development / deployment instructions in R...
created_at                                          2017-04-07T23:33:45Z
updated_at                                          2018-05-30T00:28:33Z
closed_at                                           2017-04-07T23:40:57Z
merged_at                                           2017-04-07T23:40:57Z
merge_commit_sha                9d97704ca4277297185a56cd4f8d83424ede465d
assignee                                                            None
assignees                                                             []
requested_reviewers                                                   []
requested_teams                                                       []
labels                                                                []
milestone                                                           None
draft                                                              False
commits_url            https://api.github.com/repos/iterative/dvc/pul...
review_comments_url    https://api.github.com/repos/iterative/dvc/pul...
review_comment_url     https://api.github.com/repos/iterative/dvc/pul...
comments_url           https://api.github.com/repos/iterative/dvc/iss...
statuses_url           https://api.github.com/repos/iterative/dvc/sta...
head                   {'label': 'iterative:elh_dist', 'ref': 'elh_di...
base                   {'label': 'iterative:master', 'ref': 'master',...
_links                 {'self': {'href': 'https://api.github.com/repo...
author_association                                           CONTRIBUTOR
active_lock_reason                                                  None
Name: 2711, dtype: object