Notebooks >> Scripts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

63 lines
1.9 KiB

import json
import logging
import os
import shutil
from joblib import Parallel, delayed
from selenium.webdriver import Chrome
from .CompanyScraper import CompanyScraper
from .ConnectionScraper import ConnectionScraper
from .ProfileScraper import ProfileScraper
from .utils import HEADLESS_OPTIONS, split_lists
logger = logging.getLogger(__name__)
def scrape_in_parallel(
    scraper_type,
    items,
    output_file,
    num_instances,
    temp_dir='tmp_data',
    driver=Chrome,
    driver_options=HEADLESS_OPTIONS,
    **kwargs
):
    """Scrape ``items`` with ``num_instances`` parallel scraper jobs.

    Each worker writes its partial results to ``temp_dir/<i>.json``; the
    partial files are then merged, optionally dumped to ``output_file``,
    and ``temp_dir`` is removed on success.

    Parameters
    ----------
    scraper_type : type
        Scraper class to instantiate in each worker (e.g. CompanyScraper,
        ConnectionScraper, ProfileScraper).
    items : list
        Identifiers (companies or users) to scrape, split evenly across
        workers via ``split_lists``.
    output_file : str or None
        Path for the merged JSON output; writing is skipped when falsy.
    num_instances : int
        Number of parallel jobs (and browser instances).
    temp_dir : str
        Scratch directory for the per-worker JSON files.
    driver, driver_options, **kwargs
        Forwarded to each worker's scraper constructor.

    Returns
    -------
    dict
        Merged mapping of item -> scraped data from all workers.
    """
    chunked_items = split_lists(items, num_instances)
    # exist_ok: temp_dir is only removed on success, so a previous failed
    # run leaves it behind and the plain os.mkdir used to crash on retry.
    os.makedirs(temp_dir, exist_ok=True)
    Parallel(n_jobs=num_instances)(delayed(scrape_job)(
        scraper_type=scraper_type,
        output_file=os.path.join(temp_dir, '{}.json'.format(i)),
        items=chunked_items[i],
        driver=driver,
        driver_options=driver_options,
        **kwargs
    ) for i in range(num_instances))
    all_data = {}
    for i in range(num_instances):
        with open(os.path.join(temp_dir, '{}.json'.format(i)), 'r') as data:
            all_data.update(json.load(data))
    if output_file:
        with open(output_file, 'w') as out:
            json.dump(all_data, out)
    shutil.rmtree(temp_dir)
    return all_data
def scrape_job(scraper_type, items, output_file, **scraper_kwargs):
    """Worker body: scrape every item with one scraper, then dump JSON.

    A single scraper of ``scraper_type`` is constructed once and reused for
    all items. Per-item failures are logged and skipped (best-effort) so one
    bad item does not abort the whole batch. Results are written to
    ``output_file`` as a JSON object keyed by item.
    """
    scraper = scraper_type(**scraper_kwargs)
    # Dispatch table: each known scraper class maps to the call that
    # extracts a JSON-serializable result for a single item.
    handlers = {
        CompanyScraper: lambda s, t: s.scrape(company=t).to_dict(),
        ConnectionScraper: lambda s, t: s.scrape(user=t),
        ProfileScraper: lambda s, t: s.scrape(user=t).to_dict(),
    }
    handle = handlers.get(scraper_type)
    results = {}
    for target in items:
        try:
            if handle is not None:
                results[target] = handle(scraper, target)
        except Exception as err:
            logger.exception("%s could not be scraped: %s", target, err)
    with open(output_file, 'w') as out:
        json.dump(results, out)