Source code for torlib.crawler.github_crawler

import multiprocessing as mp
from tqdm import tqdm
import os
import time
import requests
import json
from pathlib import Path
import responses


class NoTokenError(Exception):
    """Raised when input list of github token is empty

    Attributes:
        message (string): explanation of the error
    """

    def __init__(self, message="please input github token"):
        self.message = message
        super().__init__(self.message)

    def __str__(self):
        return f'{self.message}'


class LengthNotMatchError(Exception):
    """Raised when the length of savename and url is not the same

    Attributes:
        savename (list): input list that caused the error
        url (list): input list that caused the error
        message (string): explanation of the error
    """

    def __init__(self, savename, url, message="savename and url must have same length"):
        self.length_savename = len(savename)
        self.length_url = len(url)
        self.message = message
        super().__init__(self.message)

    def __str__(self):
        return f'len(savename)={self.length_savename} len(url)={self.length_url} -> {self.message}'


class InputNotStringError(Exception):
    """Raised when not all of member in savename or url are string

    Attributes:
        error_list_name (string): name of the input list that caused the error
        message (string): explanation of the error
    """

    def __init__(self, error_list_name, message="savename or url must contain only string"):
        self.error_list_name = error_list_name
        self.message = message
        super().__init__(self.message)

    def __str__(self):
        return f'{self.error_list_name} -> {self.message}'


def github_crawler_multipage(savename, url, GHtoken, retry=3, pc=1,
                             log_file='github_crawler_log.txt', output_dir='',
                             for_test=False, pretty_json=True):
    """Crawl the GitHub API and save each response as a JSON file.

    This function also generates a log file listing the API URLs that could
    not be crawled.

    Args:
        savename (list): strings used as the save file names (must have the same length as url)
        url (list): strings of the target API URLs (must have the same length as savename)
        GHtoken (list): list of GitHub tokens
        retry (int, optional): number of times to retry the failed cases. Defaults to 3.
        pc (int, optional): number of processes for multiprocessing. Defaults to 1.
        log_file (str, optional): name of the log file listing the failed cases. Defaults to 'github_crawler_log.txt'.
        output_dir (str, optional): output directory. Defaults to ''.
        for_test (boolean, optional): whether the call is used for testing. Defaults to False.
        pretty_json (boolean, optional): make the output JSON files easier to read. Defaults to True.

    Raises:
        LengthNotMatchError: raised when savename and url do not have the same length
        InputNotStringError: raised when not all members of savename or url are strings
        NoTokenError: raised when the input list of GitHub tokens is empty
    """
    # check that savename and url have the same length
    if len(savename) != len(url):
        raise LengthNotMatchError(savename, url)

    # check that every member of savename and url is a string
    for i in savename:
        if type(i) != str:
            raise InputNotStringError('savename')
    for i in url:
        if type(i) != str:
            raise InputNotStringError('url')

    # check that the list of GitHub tokens is not empty
    if len(GHtoken) == 0:
        raise NoTokenError()

    # if savename and url are empty there is nothing to do
    if len(savename) == 0 and len(url) == 0:
        return

    # create output_dir if it does not exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # prepare the parameters for __collect_json_multipage
    save_path = [os.path.join(output_dir, f'{sn}.json') for sn in savename]
    list_to_crawl = list(zip(save_path, url))
    for i in range(len(list_to_crawl)):
        list_to_crawl[i] = list_to_crawl[i] + \
            (GHtoken[i % len(GHtoken)], pretty_json,)

    count_try = 0
    complete = False
    while count_try < retry and not complete:
        print(f'{count_try+1} attempt to crawl {len(list_to_crawl)} url')
        count_try = count_try + 1
        # create a pool for multiprocessing
        with mp.Pool(pc) as p:
            multi_out = tqdm(p.imap(__collect_json_multipage if not for_test
                                    else __collect_json_multipage_for_testing,
                                    list_to_crawl, chunksize=1),
                             total=len(list_to_crawl))
            result = [i for i in multi_out]
        # check whether every url succeeded and keep only the failed ones
        complete = True
        remain_list_to_crawl = []
        for i in range(len(result)):
            crawled_url, is_success = result[i]
            if is_success != 1:
                complete = False
                remain_list_to_crawl.append(list_to_crawl[i])
        list_to_crawl = remain_list_to_crawl
        print(list_to_crawl)

    # log the urls that could not be crawled
    fail_list = [(url, is_success)
                 for url, is_success in result if is_success != 1]
    with open(log_file, 'w') as outfile:
        if pretty_json:
            json.dump(fail_list, outfile, indent=4)
        else:
            json.dump(fail_list, outfile)
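

# Usage sketch (a hedged example, not part of this module's API: the
# repository URLs, save names, and token below are placeholders):
#
#     github_crawler_multipage(
#         savename=['repo1_issues', 'repo2_issues'],
#         url=['https://api.github.com/repos/<owner>/<repo1>/issues',
#              'https://api.github.com/repos/<owner>/<repo2>/issues'],
#         GHtoken=['<your-github-token>'],
#         retry=3,
#         pc=2,
#         output_dir='crawled_json',
#     )
#
# Each URL is written to '<output_dir>/<savename>.json', and any URL that
# still fails after `retry` attempts is listed in the log file.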


def __collect_json_multipage(input_tuple):
    """Save the response from the request as a JSON file.

    Args:
        input_tuple (tuple): tuple containing four variables:
            - save_path (string) - save file name with path
            - url (string) - url of the target api
            - GHtoken (string) - GitHub token
            - pretty_json (boolean) - make the output JSON file easier to read

    Returns:
        tuple: (url, result) showing the status of the crawl.
            result will be 1 on success; otherwise it will be the exception
            message of the error that occurred
    """
    save_path, url, GHtoken, pretty_json = input_tuple
    # if the file already exists proceed to the next one
    if os.path.exists(save_path):
        return (url, 1)
    page = 1
    stop_flag = False  # track the last page
    result_json = []
    try:
        while not stop_flag:
            r = requests.get(url + '?per_page=100&page=' + str(page),
                             headers={'Authorization': 'token ' + GHtoken})
            # if no rate limit remains, wait until the rate limit resets and try again
            if int(r.headers['X-RateLimit-Remaining']) <= 0:
                current_time = time.time()
                left_time = current_time - int(r.headers['X-RateLimit-Reset'])
                while left_time < 0:
                    time.sleep(10)
                    current_time = time.time()
                    left_time = current_time - \
                        int(r.headers['X-RateLimit-Reset'])
                r = requests.get(url + '?per_page=100&page=' + str(page),
                                 headers={'Authorization': 'token ' + GHtoken})
            json_r = r.json()
            if type(json_r) == dict:
                result_json.append(json_r)
            else:
                result_json.extend(json_r)
            page = page + 1
            # stop if this is the last page
            if 'link' not in r.headers or 'rel="last"' not in r.headers['link']:
                stop_flag = True
        # save the json file
        with open(save_path, 'w') as outfile:
            if pretty_json:
                json.dump(result_json, outfile, indent=4)
            else:
                json.dump(result_json, outfile)
    # return the error
    except Exception as e:
        return (url, str(e))
    return (url, 1)


@responses.activate
def __collect_json_multipage_for_testing(input_tuple):
    """Function used for testing only."""
    responses.add(responses.GET, 'http://test_github/api/1?per_page=100&page=1',
                  status=200, content_type='application/json',
                  headers={'X-RateLimit-Remaining': '100000'},
                  body='{"test": "test1"}')
    responses.add(responses.GET, 'http://test_github/api/2?per_page=100&page=1',
                  status=200, content_type='application/json',
                  headers={'X-RateLimit-Remaining': '100000'},
                  body='{"test": "test2"}')
    responses.add(responses.GET, 'http://test_github/api/3?per_page=100&page=1',
                  status=200, content_type='application/json',
                  headers={'X-RateLimit-Remaining': '100000'},
                  body='{"test": "test3"}')
    return __collect_json_multipage(input_tuple)
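

# Testing sketch (a hedged example; assumption: only the three mocked
# endpoints registered above will resolve, and the token value is ignored
# by the mock):
#
#     github_crawler_multipage(
#         savename=['t1', 't2', 't3'],
#         url=['http://test_github/api/1',
#              'http://test_github/api/2',
#              'http://test_github/api/3'],
#         GHtoken=['dummy-token'],
#         output_dir='test_output',
#         for_test=True,
#     )
#
# With for_test=True the pool workers call __collect_json_multipage_for_testing,
# so the requests are answered by the `responses` mocks instead of the real
# GitHub API.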