Apache Airflow DAGs with backend configuration bundle.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

70 lines
2.3 KiB

#!/bin/bash
# Runs a per-site scraper for a keyword and archives the files it produces.
#
# Options:
#   -s, --site SITE        name of the scraper / archive directory to use
#   -k, --keyword KEYWORD  keyword handed to the scraper
set -e

# Timestamp that names this run's archived copies.
NOW=$(date +"%Y-%m-%d_%H-%M-%S")

#######################################
# Parse command-line options into globals.
# Globals:   SITE (written), KEYWORD (written)
# Arguments: the script's command-line arguments
# Outputs:   an error message on stderr when an option has no value
# Returns:   0 on success, 2 when an option is missing its value
#######################################
parse_args() {
  # Walk the live positional parameters with `while`/`shift`. Iterating a
  # snapshot (`for arg in "$@"`) while shifting lets "$2" drift away from the
  # argument being examined, e.g. `foo -s target` would store "-s" as the site.
  while (( $# > 0 )); do
    case "$1" in
      -s|--site)
        if (( $# < 2 )); then
          echo "error: $1 requires a value" >&2
          return 2
        fi
        SITE="$2"
        shift 2 # Remove argument name and value from processing
        ;;
      -k|--keyword)
        if (( $# < 2 )); then
          echo "error: $1 requires a value" >&2
          return 2
        fi
        KEYWORD="$2"
        shift 2 # Remove argument name and value from processing
        ;;
      *)
        # Unrecognised arguments are skipped rather than rejected.
        shift
        ;;
    esac
  done
}

# Under `set -e` a non-zero return here aborts the script.
parse_args "$@"
# Run the scraper for $SITE and archive its output.
#
# Reads:  SITE, KEYWORD, NOW (set earlier in this script).
# Steps:  1. run /data/scripts/gather/$SITE.py -k $KEYWORD from /data/scripts
#         2. copy each output file to a timestamped name under
#            /data/data/archive/$SITE/{json,csv,html,png}
#         3. move the originals into /data/data/archive/$SITE/last
# Any failing command aborts the script when `set -e` is in effect.

# SITE is interpolated into every path below; stop with a clear message
# instead of trying to run "/data/scripts/gather/.py".
: "${SITE:?SITE is not set; pass -s or --site}"
# NOTE(review): KEYWORD is passed through unchecked - whether the scraper
# accepts an empty keyword is not visible from this script.

archive_dir="/data/data/archive/$SITE"

cd /data/scripts
echo "$PWD"
# Prints the installed selenium version; aborts here if the import fails.
python3 -c "import selenium; print(selenium.__version__)"
# python3 /data/scripts/target/scraper.py -k gitlab
# python3 /data/scripts/linkedin/scraper.py -k gitlab
echo "Searching $SITE for $KEYWORD"
python3 /data/scripts/gather/"$SITE".py -k "$KEYWORD"
echo "List Generated"

# `--` keeps a SITE that starts with "-" from being read as a cp/mv option.
echo "Archiving raw JSON"
cp -- "$SITE".json "$archive_dir/json/$NOW.json"
cp -- "$SITE"-blacklist.json "$archive_dir/json/blacklist-$NOW.json"
echo "Archived $archive_dir/json/$NOW.json"

echo "Archiving raw CSV"
cp -- "$SITE".csv "$archive_dir/csv/$NOW.csv"
echo "Archived $archive_dir/csv/$NOW.csv"

echo "Archiving HTML"
cp -- "$SITE"-page.html "$archive_dir/html/page-$NOW.html"
cp -- "$SITE"-snippet.html "$archive_dir/html/snippet-$NOW.html"
echo "Archived $archive_dir/html/page-$NOW.html"
echo "Archived $archive_dir/html/snippet-$NOW.html"

echo "Archiving screenshots"
cp -- "$SITE"-scroll.png "$archive_dir/png/scroll-$NOW.png"
# Report the name actually written (scroll-<timestamp>.png).
echo "Archived $archive_dir/png/scroll-$NOW.png"

# Move the originals out of the working directory into "last", which then
# holds the files from the most recent run.
echo "Cleanslating"
mv -- "$SITE"-scroll.png "$archive_dir/last/$SITE-scroll.png"
mv -- "$SITE"-page.html "$archive_dir/last/$SITE-page.html"
mv -- "$SITE"-snippet.html "$archive_dir/last/$SITE-snippet.html"
mv -- "$SITE".csv "$archive_dir/last/$SITE.csv"
mv -- "$SITE".json "$archive_dir/last/$SITE.json"
mv -- "$SITE"-blacklist.json "$archive_dir/last/$SITE-blacklist.json"
echo "Cleanslated"
# cat target.json | sqlite-utils insert --alter /data/data/datasette/zip.db $NOW -
# echo "Adding to master table"
# sqlite-utils insert /data/data/datasette/"$SITE".db "$NOW" "$SITE".json --alter --truncate --pk=id
# echo "Table added"
# sqlite-utils analyze-tables /data/data/datasette/target.db --save