Scraping GitHub and PyPI to Understand Project Stability
New industrial segments in software grow rapidly by addressing problems that have been holding back other software use cases. The MLOps industry, for example, was adding 2-3 new products per week (as of 2021) meant to solve various pitfalls developers face in the pursuit of deploying machine learning software. This notebook automates some of the metrics that can be used to understand project code stability. GitHub and PyPI are open registries where software libraries are published for easy download by users. We use those to estimate stability and popularity.
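As a first, hedged sketch of the popularity side: pypistats.org exposes a small JSON API over PyPI's download data. The /api/packages/<name>/recent endpoint below follows its public docs; treat the exact response shape as an assumption.

import requests

def pypi_recent_downloads(package):
    # pypistats.org aggregates PyPI download counts; 'recent' returns
    # last_day / last_week / last_month totals under the 'data' key.
    resp = requests.get(f'https://pypistats.org/api/packages/{package}/recent')
    resp.raise_for_status()
    return resp.json()['data']

pypi_recent_downloads('dvc')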
akeys = ['SACRED', 'Weights and Biases', 'ML Flow', 'Comet ML', 'Neptune.ai',
'Valohai', 'Guild AI', 'Spell.ML', 'DVC', 'CML', 'Yacs', 'Atlas',
'Sirio ML', 'Verta AI', 'Pachyderm', 'MLeap', 'Soda', 'Kubeflow',
'ClearML', 'FlexP', 'ML Pipeline', 'CNVRG', 'Jovian ML', 'Datajoint',
'Studio', 'Datmo', 'Lore', 'FORGE', 'Sumatra', 'RandOpt', 'feature Forge',
'ModelChimp', 'PolyAxon', 'Optuna', 'ML Commons', 'OctoML', 'Algorithmia',
'Flyte', 'MetaFlow', 'Datarobot', 'Dataiku', 'Allegro AI', 'Iguazio', 'DBT',
'sisyphus', 'Seldon']
fkeys = ['Abacus AI','Aible','Cubonacci','Databricks','Datagrok','Dataiku','DataRobot',
'Dessa','dotData','Elementl','H2O','HIVE','Iguazio','kedro','Michelangelo','MLFlow',
'Obliviously AI','Peltarion','Petuum','Picsell.ia','Polyaxon','Robust AI',
'Snorkel AI','Stradigi AI','Supervisely','Tecton','Xpanse AI','Aircloak',
'Alluxio','Alteryx','Amazon Redshift','Amundsen','Anodot','Apache Druid',
'Apache Hudi','Apache ORC','Aparavi','AresDB','Ascend.io','AtScale','Cazena',
'ClearSky Data','Cohesity','Confluent','cuDF','Dask','Datatable','Dataturks',
'Datera','DefinedCrowd','Delta Lake','Doccano','Dolt','Dremio','DVC - Iterative.ai',
'Elastifile','erwin','Excelero','Facets','FEAST','Figure Eight','Fluree','Gemini Data',
'Git LFS','Gluent','Graviti','Gretel AI','Hammerspace','Heartex Label Studio','Igneous',
'iMerit','Imply','Incorta','Kimono Labs','Koalas','Komprise','Kyvos Insights','Labelbox','LabelImg',
'Materialize','Milvus','Modin','Naveego','Octopai','Pachyderm','Parquet','Pilosa','Playment',
'Presto','Prodigy','Prometheus','Qri','Quilt Data','Quobyte','Rockset','Rubrik','Scale AI',
'Scrapinghub','Segments.ai','Sisu','Snorkel','Spark','SQLFlow','Starburst Data','Storbyte',
'Superb AI','Synthetaic','Tamr','TerminusDB','Tumult Labs','V7Labs','Vaex','Vearch','Vexata',
'Voxel51 // Scoop','Waterline Data','Yellowbrick Data','Zilliz','Blaize','Boulder AI','BrainChip',
'Cambricon','Cerebras','EdgeQ','Graphcore','GreenWaves Technologies','Groq','Habana Labs','Hailo',
'Kneron','LeapMind','Lightelligence','Luminous Computing','Mythic','Nuvia','Prophesee','SambaNova',
'SiMa.ai','Syntiant','Wave Computing','Zero ASIC','Airflow','Anyscale','Backend AI','Cadence',
'Cloudera','Datadog','Domino Data Lab','FloydHub','Flyte','HYCU','Luigi','Metaflow','Paperspace',
'Prefect','Valohai','Accord','AIMET','Alectio','Alink','Allegro AI/TRAINS','AllenNLP','Angel ML',
'Apache Mahout','Apache MXNet','BentoML','Boruta','Caffe','Catalyst','Chainer','CleverHans','Colab',
'Comet','DAGsHub','DarwinAI','Datmo','DAWNBench','DeepNote','Determined AI','Dialogflow','Dockship',
'euler','explainX.ai','fastText','Featuretools','FedAI (FATE)','Fiddler Labs','flair',
'Gensim','GluonCV','Grid AI','Horovod','Hugging Face','HyperOpt','InterpretML','JAX','Katib','Kyndi',
'LightGBM','LIME','Lucid','Ludwig','Matroid','Mindspore','ML.NET','MLlib','MLPerf','NeMo','Neptune',
'Netron','nteract','OpenSeq2Seq','Paddle','papermill','PerceptiLabs','PlaidML','Pyro','PySyft',
'Pythia','PyTorch','PyTorch Lightning','Rasa','Ray','Replicate','River','scikit-learn','scribble Data',
'SHAP','SigOpt','spaCy','Spell','Streamlit','talos','Tazi.ai','Tensorboard','TensorFlow','Theano',
'TPOT','TransmogrifAI','Truera','tsfresh','Tune','Turi Create','Vowpal Wabbit','Weights & Biases',
'XGBoost','Algorithmia','Apache Flink','Apache Kafka','Apache TVM','Argo','Arize AI','Arthur AI','Clipper',
'Core ML','Cortex','Dash','Datatron','Deeplite','Evidently AI','Formant','Fritz AI','Gradio','Inferrd',
'Kubeflow','Losswise','ML Kit','MMdnn','MNN','Mona Labs','ncnn','Neural Network Distiller','OctoML',
'ONNX','Plotly','PredictionIO','RelicX','Seldon','superwise.ai','TensorFlow Extended','TensorFlow Lite',
'TensorRT','Unravel Data','VertaAI','Xnor.ai']
A possibly useful link: PyDriller is a project for analyzing the code in git repos. https://github.com/ishepard/pydriller A minimal sketch follows.
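This sketch assumes pydriller 2.x, where Repository().traverse_commits() is the entry point; commits per author is one cheap stability signal.

from collections import Counter
from pydriller import Repository

# Clones the repo on first use, then walks every commit in history.
author_counts = Counter(
    commit.author.name
    for commit in Repository('https://github.com/iterative/dvc').traverse_commits()
)
author_counts.most_common(10)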
from difflib import SequenceMatcher

def similar(a, b):
    """Return the SequenceMatcher similarity ratio between two strings (0.0 to 1.0)."""
    return SequenceMatcher(None, a, b).ratio()
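For example, similar('ML Flow', 'MLFlow') is 2*6/13 ≈ 0.92 (six matched characters across thirteen total), which is why near-identical vendor spellings pair up below while unrelated names score well under the 0.65 threshold used later.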
lds = []
for akey in akeys:
    tmp = []
    for fkey in fkeys:
        tmp.append({
            'aval': akey,
            'fval': fkey,
            'score': similar(a=akey, b=fkey)
        })
    lds.append(tmp)
def top_k(listdict, key_name, k, direction):
    """Return the top k dicts sorted on key_name: 'ltg' = least to greatest, 'gtl' = greatest to least."""
    if direction == 'ltg':
        s = sorted(listdict, key=lambda x: x[key_name], reverse=False)
    elif direction == 'gtl':
        s = sorted(listdict, key=lambda x: x[key_name], reverse=True)
    else:
        raise ValueError("direction must be 'ltg' or 'gtl'")
    return s[:k]
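For example, top_k(listdict=lds[0], key_name='score', k=3, direction='gtl') returns the three fkeys closest to 'SACRED'.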
k = 3
out = []
for ld in lds:
    out.append({
        'key': ld[0]['aval'],
        'matches': top_k(listdict=ld, key_name='score', k=k, direction='gtl')
    })
thresh = 0.65
filtered = []
for obj in out:
    if obj['matches'][0]['score'] < thresh:
        continue
    filtered.append({
        'key': obj['key'],
        'match': obj['matches'][0]
    })
print('matches:', len(filtered), 'out of:', len(akeys))
len(filtered) / len(akeys)
import pandas as pd
pd.DataFrame([{'key': fil['key'], 'match': fil['match']['fval']} for fil in filtered]).to_csv('corr1.csv')
import os
import pandas as pd
from ghapi.core import GhApi, HTTP403ForbiddenError
import ghapi
from urllib.parse import urlparse
import time
from pathlib import Path
from datetime import datetime, timedelta
linkedin_url = 'https://www.linkedin.com/company/iterative-ai/'
github_url = 'https://github.com/iterative/dvc'
company_url = 'https://dvc.org/'
class GithubInfo:
    def __init__(self, ghapi_client, url, cachedir='gh_data'):
        self.url = url
        self.cachedir = Path(os.getcwd()) / cachedir
        self.cachedir.mkdir(parents=True, exist_ok=True)
        self.ghapi_client = ghapi_client
        comps = self.parse_gh_url()
        self.owner = comps['owner']
        self.repo = comps['repo']
        self.alerts = self.get_alerts()
        self.get_all_prs()
        self.prs.index = self.prs['id']
        self.pr_analysis = {}
        self.analyze_prs()

    def parse_gh_url(self):
        """Split a github.com URL into its owner and repo components."""
        comps = urlparse(self.url)
        return {
            'owner': comps.path.split('/')[1],
            'repo': comps.path.split('/')[2]
        }

    def get_alerts(self):
        """Fetch code-scanning alerts; many repos have this API disabled."""
        try:
            return self.ghapi_client.code_scanning.list_alerts_for_repo(owner=self.owner, repo=self.repo)
        except ghapi.core.HTTP403ForbiddenError:
            return 'Advanced security disabled.'
        except ghapi.core.HTTP404NotFoundError:
            return 'The ref does not match an existing ref'
        except ghapi.core.HTTP5xxServerError:
            return 'Service unavailable'

    def make_cache_fp(self, object_type):
        """Build a dated cache path so each day's pull gets its own file."""
        today = datetime.today().strftime('%Y-%m-%d')
        fn = '_'.join([self.owner, self.repo, today]) + '.csv.gz'
        return self.cachedir / object_type / fn

    def download_prs(self):
        """Page through the pulls API until an empty page or an error."""
        prs = []
        page = 1  # GitHub pagination is 1-indexed
        while True:
            try:
                new_page = self.ghapi_client.pulls.list(
                    owner=self.owner, repo=self.repo, state='all', per_page=100, page=page
                )
            except Exception:
                break
            if len(new_page) == 0:
                break
            prs += new_page
            page += 1
            time.sleep(1)  # stay friendly to the rate limit
        return prs

    def get_all_prs(self):
        cache_fp = self.make_cache_fp(object_type='prs')
        if cache_fp.exists():
            print('using existing prs file.', cache_fp)
            self.prs = pd.read_csv(cache_fp, compression='gzip')
            return
        cache_fp.parent.mkdir(parents=True, exist_ok=True)
        # TODO: Get yesterday's file and find out which rows to get.
        print('getting prs, storing in', cache_fp)
        self.prs = pd.DataFrame(data=self.download_prs())
        self.prs.to_csv(cache_fp, index=False, compression='gzip')
        return

    def analyze_prs(self):
        first_pr = self.return_first_pr()
        self.pr_analysis['first_pr_created_at'] = first_pr['created_at']
        self.pr_analysis['first_pr_merged_at'] = first_pr['merged_at']

    def return_first_pr(self):
        return self.prs.sort_values('created_at').iloc[0]
gh_client = GhApi()
GH = GithubInfo(ghapi_client=gh_client, url=github_url)
GH.pr_analysis
gh_client.rate_limit.get()
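Note that GhApi() picks up a token from the GITHUB_TOKEN environment variable when one is set; unauthenticated clients are limited to 60 requests/hour versus 5,000 authenticated, which is why the rate_limit check above is worth watching while paging through PRs.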
"""
Ideas:
- detect anomalies on github project deliveries (missed deadlines, etc..)
Versions
-
PR Overall question:
- First PR merged on date DONE
- Average days between PRs open
- Average days between PRs merged
PR overall questions per week:
- PRs(week)
- Devs(week)
-
Per PR questions:
- average time per PR
- average size per PR
- average contributors per PR
Issues overall per week:
- new issues
- resolved issues
"""
pr_df = pd.DataFrame(GH.prs).sort_values('created_at')
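A quick sketch of the PRs(week) idea from the list above, using only the 'created_at' column the pulls API returns; a thinning weekly cadence is one signal of a stalling project.

# Count PRs opened per week; utc=True then tz_convert(None) normalizes
# GitHub's tz-aware timestamps so to_period works cleanly.
weekly_prs = (
    pd.to_datetime(pr_df['created_at'], utc=True)
      .dt.tz_convert(None)
      .dt.to_period('W')
      .value_counts()
      .sort_index()
)
weekly_prs.tail()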
t1 = pd.DataFrame(pd.to_datetime(pr_df['merged_at'])).sort_values('merged_at')
t1['shift1'] = t1['merged_at'].shift(1)
t1.dropna(how='any', inplace=True)
t1['diff'] = t1['merged_at'] - t1['shift1']
t1['diff'] = t1['diff'].apply(lambda x: x.total_seconds())
t1
type(t1.iloc[0]['shift1'])
def compute_mtb(df, feature, stat):
    """Compute stats over the time between consecutive values of a datetime feature."""
    tmp = pd.DataFrame(pd.to_datetime(df[feature])).sort_values(feature)
    tmp[feature + '_shift1'] = tmp[feature].shift(1)
    tmp[feature + '_diff'] = tmp[feature] - tmp[feature + '_shift1']
    if stat == 'mean':
        return tmp[feature + '_diff'].mean().total_seconds()
    elif stat == 'min':
        return tmp[feature + '_diff'].min().total_seconds()
    elif stat == 'max':
        return tmp[feature + '_diff'].max().total_seconds()
    elif stat == 'sd':
        return tmp[feature + '_diff'].std().total_seconds()
    raise ValueError(f'unknown stat: {stat}')
compute_mtb(df=GH.prs, feature='merged_at', stat='mean')
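To express this as the "average days between PRs merged" metric from the ideas list, divide by 86,400 seconds per day: compute_mtb(df=GH.prs, feature='merged_at', stat='mean') / 86400.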
pr_df.iloc[0]