{ "cells": [ { "cell_type": "markdown", "id": "ce111c4e-8e57-439f-888d-30b694c69a9d", "metadata": {}, "source": [ "Scrapers" ] }, { "cell_type": "code", "execution_count": 2, "id": "1f76145c-52ac-4f6a-bd12-3fdc95306486", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting jedi-language-server\n", " Downloading jedi_language_server-0.36.0-py3-none-any.whl (30 kB)\n", "Collecting pydantic<2.0,>=1.7\n", " Downloading pydantic-1.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m19.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting pygls<0.12.0,>=0.11.1\n", " Downloading pygls-0.11.3-py3-none-any.whl (86 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.7/86.7 KB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting docstring-to-markdown<1.0.0\n", " Downloading docstring_to_markdown-0.10-py3-none-any.whl (17 kB)\n", "Requirement already satisfied: jedi<0.19.0,>=0.18.0 in /opt/conda/lib/python3.9/site-packages (from jedi-language-server) (0.18.1)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from jedi<0.19.0,>=0.18.0->jedi-language-server) (0.8.3)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.9/site-packages (from pydantic<2.0,>=1.7->jedi-language-server) (4.2.0)\n", "Collecting pydantic<2.0,>=1.7\n", " Downloading pydantic-1.8.2-cp39-cp39-manylinux2014_x86_64.whl (11.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.3/11.3 MB\u001b[0m \u001b[31m22.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting typeguard<3,>=2.10.0\n", " Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)\n", "Installing collected packages: typeguard, pydantic, docstring-to-markdown, pygls, jedi-language-server\n", "Successfully installed docstring-to-markdown-0.10 jedi-language-server-0.36.0 pydantic-1.8.2 pygls-0.11.3 typeguard-2.13.3\n" ] } ], "source": [ "import sys\n", "\n", "!{sys.executable} -m pip install -U jedi-language-server" ] }, { "cell_type": "code", "execution_count": null, "id": "9b59390e-6f72-4450-9e88-a20f5fc230df", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting seleniumbase\n", " Downloading seleniumbase-2.5.4.post1-py2.py3-none-any.whl (439 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.4/439.4 KB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hCollecting scrapy-selenium\n", " Downloading scrapy_selenium-0.0.7-py3-none-any.whl (6.7 kB)\n", "Collecting pytest-ordering==0.6\n", " Downloading pytest_ordering-0.6-py3-none-any.whl (4.6 kB)\n", "Collecting selenium==4.1.3\n", " Downloading selenium-4.1.3-py3-none-any.whl (968 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m968.8/968.8 KB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hCollecting pytest-html==2.0.1\n", " Downloading pytest_html-2.0.1-py2.py3-none-any.whl (15 kB)\n", "Requirement already satisfied: pip>=22.0.4 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (22.0.4)\n", 
"Collecting sbvirtualdisplay==1.1.0\n", " Downloading sbvirtualdisplay-1.1.0-py2.py3-none-any.whl (13 kB)\n", "Collecting soupsieve==2.3.2.post1\n", " Using cached soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)\n", "Requirement already satisfied: cffi==1.15.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.15.0)\n", "Requirement already satisfied: decorator==5.1.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (5.1.1)\n", "Collecting pluggy==1.0.0\n", " Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)\n", "Collecting more-itertools==8.12.0\n", " Downloading more_itertools-8.12.0-py3-none-any.whl (54 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.3/54.3 KB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: setuptools>=62.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (62.1.0)\n", "Requirement already satisfied: charset-normalizer==2.0.12 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.0.12)\n", "Requirement already satisfied: pygments==2.12.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.12.0)\n", "Requirement already satisfied: cssselect==1.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.1.0)\n", "Collecting pytest==7.1.2\n", " Downloading pytest-7.1.2-py3-none-any.whl (297 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m297.0/297.0 KB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting trio-websocket==0.9.2\n", " Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)\n", "Collecting wsproto==1.1.0\n", " Downloading wsproto-1.1.0-py3-none-any.whl (24 kB)\n", "Collecting nose==1.3.7\n", " Downloading nose-1.3.7-py3-none-any.whl (154 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m154.7/154.7 KB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: requests==2.27.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.27.1)\n", "Requirement already satisfied: matplotlib-inline==0.1.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.1.3)\n", "Collecting boto==2.49.0\n", " Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting toml==0.10.2\n", " Using cached toml-0.10.2-py2.py3-none-any.whl (16 kB)\n", "Requirement already satisfied: jedi==0.18.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.18.1)\n", "Collecting chardet==4.0.0\n", " Downloading chardet-4.0.0-py2.py3-none-any.whl (178 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m178.7/178.7 KB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tomli>=2.0.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.0.1)\n", "Requirement already satisfied: colorama==0.4.4 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.4.4)\n", "Collecting pytest-forked==1.4.0\n", " Downloading pytest_forked-1.4.0-py3-none-any.whl (4.9 kB)\n", "Collecting execnet==1.9.0\n", " Downloading execnet-1.9.0-py2.py3-none-any.whl (39 kB)\n", "Requirement already satisfied: 
certifi>=2021.10.8 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2021.10.8)\n", "Requirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (6.0)\n", "Requirement already satisfied: traitlets>=5.1.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (5.1.1)\n", "Requirement already satisfied: attrs>=21.4.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (21.4.0)\n", "Collecting rich==12.3.0\n", " Downloading rich-12.3.0-py3-none-any.whl (232 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.1/232.1 KB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting fasteners==0.17.3\n", " Downloading fasteners-0.17.3-py3-none-any.whl (18 kB)\n", "Requirement already satisfied: prompt-toolkit==3.0.29 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (3.0.29)\n", "Collecting pytest-rerunfailures==10.2\n", " Downloading pytest_rerunfailures-10.2-py3-none-any.whl (11 kB)\n", "Collecting parameterized==0.8.1\n", " Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)\n", "Collecting pymysql==1.0.2\n", " Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 KB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: packaging>=21.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (21.3)\n", "Requirement already satisfied: platformdirs>=2.5.2 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.5.2)\n", "Requirement already satisfied: sniffio==1.2.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.2.0)\n", "Collecting trio==0.20.0\n", " Downloading trio-0.20.0-py3-none-any.whl (359 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m359.0/359.0 KB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting ipython==7.33.0\n", " Downloading ipython-7.33.0-py3-none-any.whl (793 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m793.8/793.8 KB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: wheel>=0.37.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.37.1)\n", "Collecting setuptools-scm>=6.4.2\n", " Downloading setuptools_scm-6.4.2-py3-none-any.whl (37 kB)\n", "Collecting filelock>=3.6.0\n", " Downloading filelock-3.6.0-py3-none-any.whl (10.0 kB)\n", "Collecting pytest-metadata==2.0.1\n", " Downloading pytest_metadata-2.0.1-py3-none-any.whl (9.9 kB)\n", "Collecting ipdb==0.13.9\n", " Downloading ipdb-0.13.9.tar.gz (16 kB)\n", " Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", "\u001b[?25hRequirement already satisfied: idna==3.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (3.3)\n", "Requirement already satisfied: parso==0.8.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.8.3)\n", "Collecting py==1.11.0\n", " Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 KB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: six==1.16.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.16.0)\n", "Requirement already satisfied: beautifulsoup4==4.11.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (4.11.1)\n", "Collecting h11==0.13.0\n", " Downloading h11-0.13.0-py3-none-any.whl (58 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.2/58.2 KB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyopenssl==22.0.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (22.0.0)\n", "Collecting cryptography==37.0.1\n", " Downloading cryptography-37.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m22.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: sortedcontainers==2.4.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.4.0)\n", "Collecting pytest-xdist==2.5.0\n", " Downloading pytest_xdist-2.5.0-py3-none-any.whl (41 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 KB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: urllib3==1.26.9 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.26.9)\n", "Requirement already satisfied: pycparser==2.21 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.21)\n", "Requirement already satisfied: tornado==6.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (6.1)\n", "Collecting pdfminer.six==20220319\n", " Downloading pdfminer.six-20220319-py3-none-any.whl (5.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting pyotp==2.6.0\n", " Downloading pyotp-2.6.0-py2.py3-none-any.whl (11 kB)\n", "Requirement already satisfied: Pillow==9.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (9.1.0)\n", "Requirement already satisfied: pickleshare in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (0.7.5)\n", "Requirement already satisfied: backcall in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (0.2.0)\n", "Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (4.8.0)\n", "Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit==3.0.29->seleniumbase) (0.2.5)\n", "Collecting iniconfig\n", " Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)\n", "Collecting commonmark<0.10.0,>=0.9.0\n", " Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.1/51.1 KB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: async-generator>=1.9 in /opt/conda/lib/python3.9/site-packages (from trio==0.20.0->seleniumbase) (1.10)\n", "Collecting outcome\n", " Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)\n", "Collecting scrapy>=1.0.0\n", " Downloading Scrapy-2.6.1-py2.py3-none-any.whl (264 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.3/264.3 KB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.9/site-packages (from packaging>=21.3->seleniumbase) (3.0.8)\n", "Collecting protego>=0.1.15\n", " Downloading Protego-0.2.1-py2.py3-none-any.whl (8.2 kB)\n", "Requirement already satisfied: parsel>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (1.6.0)\n", "Collecting tldextract\n", " Downloading tldextract-3.2.1-py3-none-any.whl (87 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.8/87.8 KB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: lxml>=3.5.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (4.8.0)\n", "Collecting Twisted>=17.9.0\n", " Downloading Twisted-22.4.0-py3-none-any.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hCollecting service-identity>=16.0.0\n", " Downloading service_identity-21.1.0-py2.py3-none-any.whl (12 kB)\n", "Requirement already satisfied: w3lib>=1.17.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (1.22.0)\n", "Collecting itemadapter>=0.1.0\n", " Downloading itemadapter-0.5.0-py3-none-any.whl (10 kB)\n", "Collecting itemloaders>=1.0.1\n", " Downloading itemloaders-1.0.4-py3-none-any.whl (11 kB)\n", "Collecting queuelib>=1.4.2\n", " Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)\n", "Collecting zope.interface>=4.1.3\n", " Downloading zope.interface-5.4.0-cp39-cp39-manylinux2010_x86_64.whl (255 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m255.8/255.8 KB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting PyDispatcher>=2.0.5\n", " Downloading PyDispatcher-2.0.5.zip (47 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.6/47.6 KB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", "\u001b[?25hCollecting jmespath>=0.9.5\n", " Downloading jmespath-1.0.0-py3-none-any.whl (23 kB)\n", "Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython==7.33.0->seleniumbase) (0.7.0)\n", "Collecting pyasn1-modules\n", " Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m155.3/155.3 KB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pyasn1\n", " Downloading pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.1/77.1 KB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting Automat>=0.8.0\n", " Downloading Automat-20.2.0-py2.py3-none-any.whl (31 kB)\n", "Collecting constantly>=15.1\n", " Downloading constantly-15.1.0-py2.py3-none-any.whl (7.9 kB)\n", "Requirement already satisfied: typing-extensions>=3.6.5 in /opt/conda/lib/python3.9/site-packages (from Twisted>=17.9.0->scrapy>=1.0.0->scrapy-selenium) (4.2.0)\n", "Collecting incremental>=21.3.0\n", " Downloading incremental-21.3.0-py2.py3-none-any.whl (15 kB)\n", "Collecting hyperlink>=17.1.1\n", " Downloading hyperlink-21.0.0-py2.py3-none-any.whl (74 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.6/74.6 KB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: PySocks!=1.5.7,<2.0,>=1.5.6 in /opt/conda/lib/python3.9/site-packages (from urllib3==1.26.9->seleniumbase) (1.7.1)\n", "Collecting requests-file>=1.4\n", " Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)\n", "Building wheels for collected packages: ipdb, PyDispatcher\n", " Building wheel for ipdb (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for ipdb: filename=ipdb-0.13.9-py3-none-any.whl size=11646 sha256=4b32fcc05f5ab0b0fa9bb2fdf21ec3d5ae8465b51bb9d106e916a77dda392b14\n", " Stored in directory: /home/jovyan/.cache/pip/wheels/f7/29/9a/cf774cd86e9802f075a0be1c9e0830bc062d07897b2e9e87cd\n", " Building wheel for PyDispatcher (setup.py) ... 
\u001b[?25ldone\n", "\u001b[?25h Created wheel for PyDispatcher: filename=PyDispatcher-2.0.5-py3-none-any.whl size=11516 sha256=2d98fbbdf2c1959b062f50876512ac0499e2ceacddd48fd2cda2600024aeb3d7\n", " Stored in directory: /home/jovyan/.cache/pip/wheels/a5/de/8a/4b52190a95d99c042ec6bd5ad2de3a3c1b5ce71d69f0bbd036\n", "Successfully built ipdb PyDispatcher\n", "Installing collected packages: pyotp, PyDispatcher, pyasn1, parameterized, nose, iniconfig, incremental, constantly, commonmark, boto, zope.interface, toml, soupsieve, sbvirtualdisplay, rich, queuelib, pymysql, pyasn1-modules, py, protego, pluggy, outcome, more-itertools, jmespath, itemadapter, hyperlink, h11, filelock, fasteners, execnet, chardet, Automat, wsproto, Twisted, trio, setuptools-scm, requests-file, pytest, ipython, cryptography, trio-websocket, tldextract, service-identity, pytest-rerunfailures, pytest-ordering, pytest-metadata, pytest-forked, pdfminer.six, itemloaders, ipdb, scrapy, pytest-xdist, pytest-html, selenium, seleniumbase, scrapy-selenium\n", " Attempting uninstall: soupsieve\n", " Found existing installation: soupsieve 2.3.1\n", " Uninstalling soupsieve-2.3.1:\n", " Successfully uninstalled soupsieve-2.3.1\n", " Attempting uninstall: ipython\n", " Found existing installation: ipython 8.3.0\n", " Uninstalling ipython-8.3.0:\n", " Successfully uninstalled ipython-8.3.0\n", " Attempting uninstall: cryptography\n", " Found existing installation: cryptography 36.0.2\n", " Uninstalling cryptography-36.0.2:\n", " Successfully uninstalled cryptography-36.0.2\n", "Successfully installed Automat-20.2.0 PyDispatcher-2.0.5 Twisted-22.4.0 boto-2.49.0 chardet-4.0.0 commonmark-0.9.1 constantly-15.1.0 cryptography-37.0.1 execnet-1.9.0 fasteners-0.17.3 filelock-3.6.0 h11-0.13.0 hyperlink-21.0.0 incremental-21.3.0 iniconfig-1.1.1 ipdb-0.13.9 ipython-7.33.0 itemadapter-0.5.0 itemloaders-1.0.4 jmespath-1.0.0 more-itertools-8.12.0 nose-1.3.7 outcome-1.1.0 parameterized-0.8.1 pdfminer.six-20220319 pluggy-1.0.0 protego-0.2.1 py-1.11.0 pyasn1-0.4.8 pyasn1-modules-0.2.8 pymysql-1.0.2 pyotp-2.6.0 pytest-7.1.2 pytest-forked-1.4.0 pytest-html-2.0.1 pytest-metadata-2.0.1 pytest-ordering-0.6 pytest-rerunfailures-10.2 pytest-xdist-2.5.0 queuelib-1.6.2 requests-file-1.5.1 rich-12.3.0 sbvirtualdisplay-1.1.0 scrapy-2.6.1 scrapy-selenium-0.0.7 selenium-4.1.3 seleniumbase-2.5.4.post1 service-identity-21.1.0 setuptools-scm-6.4.2 soupsieve-2.3.2.post1 tldextract-3.2.1 toml-0.10.2 trio-0.20.0 trio-websocket-0.9.2 wsproto-1.1.0 zope.interface-5.4.0\n", "Proceed (Y/n)? 
" ] } ], "source": [ "import sys\n", "\n", "!{sys.executable} -m pip install seleniumbase scrapy-selenium\n", "\n", "!{sys.executable} -m pip uninstall -q selenium\n", "!{sys.executable} -m pip install --user selenium==4.0.0" ] }, { "cell_type": "code", "execution_count": null, "id": "289368e8", "metadata": {}, "outputs": [], "source": [ "# !pip install scrapy-selenium\n", "\n", "!python scrapy/learn/awesome_selfhosted/spiders/scrapy_selenium.py" ] }, { "cell_type": "code", "execution_count": null, "id": "b98dc06b-c523-49a5-8202-8eaa4504da02", "metadata": {}, "outputs": [], "source": [ "!https://docs.scrapy.org/_/downloads/en/latest/pdf/" ] }, { "cell_type": "code", "execution_count": null, "id": "ceb94f3e-047d-4902-bcd7-9e58e3d47ed7", "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "!{sys.executable} -m pip install scrapy-pyppeteer" ] }, { "cell_type": "code", "execution_count": null, "id": "f246228d-904e-475b-a192-1e2e2a7b6914", "metadata": { "tags": [] }, "outputs": [], "source": [ "!python scrapy/scrapy-linkedin/scrapy-linkedin/spiders/linkedin_login.py" ] }, { "cell_type": "code", "execution_count": 13, "id": "facad22a", "metadata": { "execution": { "iopub.execute_input": "2021-10-16T00:29:30.210663Z", "iopub.status.busy": "2021-10-16T00:29:30.210346Z", "iopub.status.idle": "2021-10-16T00:29:33.121592Z", "shell.execute_reply": "2021-10-16T00:29:33.119808Z", "shell.execute_reply.started": "2021-10-16T00:29:30.210632Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2021-10-16 00:29:31 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)\n", "2021-10-16 00:29:31 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31\n", "2021-10-16 00:29:31 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor\n", "2021-10-16 00:29:31 [scrapy.crawler] INFO: Overridden settings:\n", "{}\n", "2021-10-16 00:29:31 [scrapy.extensions.telnet] INFO: Telnet Password: c3ecb039fe45254e\n", "2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled extensions:\n", "['scrapy.extensions.corestats.CoreStats',\n", " 'scrapy.extensions.telnet.TelnetConsole',\n", " 'scrapy.extensions.memusage.MemoryUsage',\n", " 'scrapy.extensions.logstats.LogStats']\n", "2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled downloader middlewares:\n", "['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',\n", " 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',\n", " 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',\n", " 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',\n", " 'scrapy.downloadermiddlewares.retry.RetryMiddleware',\n", " 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',\n", " 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',\n", " 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',\n", " 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',\n", " 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',\n", " 'scrapy.downloadermiddlewares.stats.DownloaderStats']\n", "2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled spider middlewares:\n", "['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',\n", " 
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',\n", " 'scrapy.spidermiddlewares.referer.RefererMiddleware',\n", " 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',\n", " 'scrapy.spidermiddlewares.depth.DepthMiddleware']\n", "2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled item pipelines:\n", "[]\n", "2021-10-16 00:29:31 [scrapy.core.engine] INFO: Spider opened\n", "2021-10-16 00:29:31 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\n", "2021-10-16 00:29:31 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023\n", "2021-10-16 00:29:31 [py.warnings] WARNING: /opt/conda/lib/python3.9/site-packages/scrapy_splash/request.py:42: ScrapyDeprecationWarning: Call to deprecated function to_native_str. Use to_unicode instead.\n", " url = to_native_str(url)\n", "\n", "2021-10-16 00:29:31 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None)\n", "2021-10-16 00:29:32 [scrapy.core.engine] INFO: Closing spider (finished)\n", "2021-10-16 00:29:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:\n", "{'downloader/request_bytes': 251,\n", " 'downloader/request_count': 1,\n", " 'downloader/request_method_count/GET': 1,\n", " 'downloader/response_bytes': 125130,\n", " 'downloader/response_count': 1,\n", " 'downloader/response_status_count/200': 1,\n", " 'elapsed_time_seconds': 1.484227,\n", " 'finish_reason': 'finished',\n", " 'finish_time': datetime.datetime(2021, 10, 16, 0, 29, 32, 805803),\n", " 'httpcompression/response_bytes': 637769,\n", " 'httpcompression/response_count': 1,\n", " 'log_count/DEBUG': 1,\n", " 'log_count/INFO': 10,\n", " 'log_count/WARNING': 1,\n", " 'memusage/max': 61575168,\n", " 'memusage/startup': 61575168,\n", " 'response_received_count': 1,\n", " 'scheduler/dequeued': 1,\n", " 'scheduler/dequeued/memory': 1,\n", " 'scheduler/enqueued': 1,\n", " 'scheduler/enqueued/memory': 1,\n", " 'start_time': datetime.datetime(2021, 10, 16, 0, 29, 31, 321576)}\n", "2021-10-16 00:29:32 [scrapy.core.engine] INFO: Spider closed (finished)\n" ] } ], "source": [ "!python scrapy/github/awesome/spiders/awesome_list.py" ] }, { "cell_type": "code", "execution_count": 1, "id": "2f3cbfcf", "metadata": {}, "outputs": [], "source": [ "import requests\n", "\n", "r = requests.get(\n", " \"http://splash:8050/render.json?url=http://domain.com/page-with-javascript.html&timeout=10&wait=0.5\"\n", ")" ] }, { "cell_type": "code", "execution_count": 2, "id": "2a7ed7d2-d379-489e-9679-ab818c31a395", "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'Response' object has no attribute 'status'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_387/4275213714.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m: 'Response' object has no attribute 'status'" ] } ], "source": [ "print(r.status)" ] }, { "cell_type": "code", "execution_count": null, "id": "7aa9bbb4-f83e-4fb9-be62-e10d361177d2", "metadata": {}, "outputs": [], "source": [ "!python scrapy/github/awesome_selfhosted/spiders/quotes.py" ] }, { "cell_type": "markdown", "id": 
"29fb8d4b-32ee-4b16-bd6b-00e042aa807e", "metadata": {}, "source": [ "# Scrapy Integration Testing" ] }, { "cell_type": "code", "execution_count": 11, "id": "9027f1aa-a239-46f7-bf4f-04f50b11120d", "metadata": { "execution": { "iopub.execute_input": "2021-10-16T00:25:34.937091Z", "iopub.status.busy": "2021-10-16T00:25:34.936806Z", "iopub.status.idle": "2021-10-16T00:25:36.936409Z", "shell.execute_reply": "2021-10-16T00:25:36.935417Z", "shell.execute_reply.started": "2021-10-16T00:25:34.937066Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2021-10-16 00:25:35 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)\n", "2021-10-16 00:25:35 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31\n", "2021-10-16 00:25:35 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor\n", "2021-10-16 00:25:35 [scrapy.crawler] INFO: Overridden settings:\n", "{}\n", "2021-10-16 00:25:35 [scrapy.extensions.telnet] INFO: Telnet Password: b8a082d3f20f441c\n", "2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled extensions:\n", "['scrapy.extensions.corestats.CoreStats',\n", " 'scrapy.extensions.telnet.TelnetConsole',\n", " 'scrapy.extensions.memusage.MemoryUsage',\n", " 'scrapy.extensions.logstats.LogStats']\n", "2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled downloader middlewares:\n", "['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',\n", " 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',\n", " 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',\n", " 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',\n", " 'scrapy.downloadermiddlewares.retry.RetryMiddleware',\n", " 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',\n", " 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',\n", " 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',\n", " 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',\n", " 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',\n", " 'scrapy.downloadermiddlewares.stats.DownloaderStats']\n", "2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled spider middlewares:\n", "['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',\n", " 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',\n", " 'scrapy.spidermiddlewares.referer.RefererMiddleware',\n", " 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',\n", " 'scrapy.spidermiddlewares.depth.DepthMiddleware']\n", "2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled item pipelines:\n", "[]\n", "2021-10-16 00:25:35 [scrapy.core.engine] INFO: Spider opened\n", "2021-10-16 00:25:35 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\n", "2021-10-16 00:25:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023\n", "2021-10-16 00:25:36 [py.warnings] WARNING: /opt/conda/lib/python3.9/site-packages/scrapy_splash/request.py:42: ScrapyDeprecationWarning: Call to deprecated function to_native_str. 
Use to_unicode instead.\n", " url = to_native_str(url)\n", "\n", "2021-10-16 00:25:36 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None)\n", "2021-10-16 00:25:36 [scrapy.core.scraper] ERROR: Spider error processing (referer: None)\n", "Traceback (most recent call last):\n", " File \"/opt/conda/lib/python3.9/site-packages/twisted/internet/defer.py\", line 858, in _runCallbacks\n", " current.result = callback( # type: ignore[misc]\n", " File \"/home/jovyan/code/scrapy/github/awesome/spiders/splash_test.py\", line 29, in parse\n", " png_bytes = base64.b64decode(response.data['png'])\n", "AttributeError: 'HtmlResponse' object has no attribute 'data'\n", "2021-10-16 00:25:36 [scrapy.core.engine] INFO: Closing spider (finished)\n", "2021-10-16 00:25:36 [scrapy.statscollectors] INFO: Dumping Scrapy stats:\n", "{'downloader/request_bytes': 232,\n", " 'downloader/request_count': 1,\n", " 'downloader/request_method_count/GET': 1,\n", " 'downloader/response_bytes': 30168,\n", " 'downloader/response_count': 1,\n", " 'downloader/response_status_count/200': 1,\n", " 'elapsed_time_seconds': 0.689888,\n", " 'finish_reason': 'finished',\n", " 'finish_time': datetime.datetime(2021, 10, 16, 0, 25, 36, 688102),\n", " 'httpcompression/response_bytes': 208226,\n", " 'httpcompression/response_count': 1,\n", " 'log_count/DEBUG': 1,\n", " 'log_count/ERROR': 1,\n", " 'log_count/INFO': 10,\n", " 'log_count/WARNING': 1,\n", " 'memusage/max': 59232256,\n", " 'memusage/startup': 59232256,\n", " 'response_received_count': 1,\n", " 'scheduler/dequeued': 1,\n", " 'scheduler/dequeued/memory': 1,\n", " 'scheduler/enqueued': 1,\n", " 'scheduler/enqueued/memory': 1,\n", " 'spider_exceptions/AttributeError': 1,\n", " 'start_time': datetime.datetime(2021, 10, 16, 0, 25, 35, 998214)}\n", "2021-10-16 00:25:36 [scrapy.core.engine] INFO: Spider closed (finished)\n" ] } ], "source": [ "!python scrapy/github/awesome/spiders/splash_test.py" ] }, { "cell_type": "code", "execution_count": 12, "id": "1a956486-2814-440d-a844-3be8769f4d74", "metadata": { "execution": { "iopub.execute_input": "2021-10-16T00:28:08.970348Z", "iopub.status.busy": "2021-10-16T00:28:08.970102Z", "iopub.status.idle": "2021-10-16T00:28:10.849795Z", "shell.execute_reply": "2021-10-16T00:28:10.848508Z", "shell.execute_reply.started": "2021-10-16T00:28:08.970324Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2021-10-16 00:28:09 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)\n", "2021-10-16 00:28:09 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31\n", "2021-10-16 00:28:09 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor\n", "2021-10-16 00:28:09 [scrapy.crawler] INFO: Overridden settings:\n", "{}\n", "2021-10-16 00:28:09 [scrapy.extensions.telnet] INFO: Telnet Password: 5715398da4f12340\n", "2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled extensions:\n", "['scrapy.extensions.corestats.CoreStats',\n", " 'scrapy.extensions.telnet.TelnetConsole',\n", " 'scrapy.extensions.memusage.MemoryUsage',\n", " 'scrapy.extensions.logstats.LogStats']\n", "2021-10-16 00:28:09 [scrapy.middleware] WARNING: Disabled SeleniumMiddleware: SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be 
set\n", "2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled downloader middlewares:\n", "['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',\n", " 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',\n", " 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',\n", " 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',\n", " 'scrapy.downloadermiddlewares.retry.RetryMiddleware',\n", " 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',\n", " 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',\n", " 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',\n", " 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',\n", " 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',\n", " 'scrapy.downloadermiddlewares.stats.DownloaderStats']\n", "2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled spider middlewares:\n", "['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',\n", " 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',\n", " 'scrapy.spidermiddlewares.referer.RefererMiddleware',\n", " 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',\n", " 'scrapy.spidermiddlewares.depth.DepthMiddleware']\n", "2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled item pipelines:\n", "[]\n", "2021-10-16 00:28:09 [scrapy.core.engine] INFO: Spider opened\n", "2021-10-16 00:28:09 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\n", "2021-10-16 00:28:10 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023\n", "2021-10-16 00:28:10 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to from \n", "2021-10-16 00:28:10 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None)\n", "2021-10-16 00:28:10 [scrapy.core.scraper] ERROR: Spider error processing (referer: None)\n", "Traceback (most recent call last):\n", " File \"/opt/conda/lib/python3.9/site-packages/twisted/internet/defer.py\", line 858, in _runCallbacks\n", " current.result = callback( # type: ignore[misc]\n", " File \"/home/jovyan/code/scrapy/github/awesome/spiders/selenium_test.py\", line 29, in parse_result\n", " print(response.request.meta['driver'].title)\n", "KeyError: 'driver'\n", "2021-10-16 00:28:10 [scrapy.core.engine] INFO: Closing spider (finished)\n", "2021-10-16 00:28:10 [scrapy.statscollectors] INFO: Dumping Scrapy stats:\n", "{'downloader/request_bytes': 432,\n", " 'downloader/request_count': 2,\n", " 'downloader/request_method_count/GET': 2,\n", " 'downloader/response_bytes': 7820,\n", " 'downloader/response_count': 2,\n", " 'downloader/response_status_count/200': 1,\n", " 'downloader/response_status_count/301': 1,\n", " 'elapsed_time_seconds': 0.602981,\n", " 'finish_reason': 'finished',\n", " 'finish_time': datetime.datetime(2021, 10, 16, 0, 28, 10, 602166),\n", " 'httpcompression/response_bytes': 14104,\n", " 'httpcompression/response_count': 1,\n", " 'log_count/DEBUG': 2,\n", " 'log_count/ERROR': 1,\n", " 'log_count/INFO': 10,\n", " 'log_count/WARNING': 1,\n", " 'memusage/max': 56184832,\n", " 'memusage/startup': 56184832,\n", " 'response_received_count': 1,\n", " 'scheduler/dequeued': 2,\n", " 'scheduler/dequeued/memory': 2,\n", " 'scheduler/enqueued': 2,\n", " 'scheduler/enqueued/memory': 2,\n", " 'spider_exceptions/KeyError': 1,\n", " 'start_time': datetime.datetime(2021, 10, 16, 0, 28, 9, 999185)}\n", "2021-10-16 00:28:10 [scrapy.core.engine] INFO: Spider closed (finished)\n" ] } ], "source": [ "!python 
scrapy/github/awesome/spiders/selenium_test.py" ] }, { "cell_type": "code", "execution_count": null, "id": "168fad05-89c9-4d48-9b04-8d1f75be4745", "metadata": {}, "outputs": [], "source": [ "from utils.browser.test import *\n", "\n", "AnonymousBrowserTest()\n", "\n", "!python utils/browser/test.py" ] }, { "cell_type": "markdown", "id": "cd7bf7a6-3bfa-4577-ab18-7f21d9564e31", "metadata": {}, "source": [ "# Testing" ] }, { "cell_type": "code", "execution_count": null, "id": "5369ec44-81f0-4875-a74f-419c7ab9e5dd", "metadata": {}, "outputs": [], "source": [ "from utils.browser.network import *\n", "\n", "chrome_options = webdriver.ChromeOptions()\n", "\n", "# capabilities = browser_capabilities(module)\n", "\n", "print(\"Initializing WebDriver...\")\n", "\n", "driver = webdriver.Remote(\n", " command_executor=\"http://192.168.1.101:4444/wd/hub\",\n", " options=chrome_options,\n", " # desired_capabilities=capabilities,\n", ")\n", "\n", "\n", "log.info(\"Proxy IP info...\")\n", "\n", "proxy_ip(\"data/proxy_ip.json\")\n", "\n", "log.info(\"VPN Settings...\")\n", "\n", "vpn_settings(\"data/vpn.json\")\n", "\n", "log.info(\"Selenoid Status...\")\n", "\n", "selenoid_status(\"data/selenoid_status.json\")\n", "\n", "log.info(\"Real IP...\")\n", "\n", "ip_status(driver)\n", "\n", "log.info(\"Browser Settings...\")\n", "\n", "browser_config(driver)\n", "\n", "log.info(\"Random User Agent...\")\n", "\n", "# user_agent(driver)\n", "\n", "# driver_object(driver)\n", "\n", "# browser_feature(driver)\n", "\n", "# uc_test()\n", "\n", "driver.quit()\n", "\n", "log.info(\"Session Terminated!\")\n", "\n", "log.info(\"Test finished\")\n", "# return" ] }, { "cell_type": "code", "execution_count": 10, "id": "cfad181f-0f1e-43b2-9479-df868e380e16", "metadata": { "execution": { "iopub.execute_input": "2021-10-16T00:23:23.049733Z", "iopub.status.busy": "2021-10-16T00:23:23.049274Z", "iopub.status.idle": "2021-10-16T00:23:23.187517Z", "shell.execute_reply": "2021-10-16T00:23:23.182088Z", "shell.execute_reply.started": "2021-10-16T00:23:23.049695Z" }, "tags": [] }, "outputs": [ { "ename": "WebDriverException", "evalue": "Message: \n\n\n\nError\n\n\n
Cannot POST /session
\n\n\n\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_111/2032127904.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mchrome_options\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChromeOptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m driver = webdriver.Remote(\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mcommand_executor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"http://192.168.1.101:3148\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchrome_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m )\n", "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, command_executor, desired_capabilities, browser_profile, proxy, keep_alive, file_detector, options)\u001b[0m\n\u001b[1;32m 155\u001b[0m warnings.warn(\"Please use FirefoxOptions to set browser profile\",\n\u001b[1;32m 156\u001b[0m DeprecationWarning, stacklevel=2)\n\u001b[0;32m--> 157\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcapabilities\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbrowser_profile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 158\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_switch_to\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSwitchTo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mobile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMobile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mstart_session\u001b[0;34m(self, capabilities, browser_profile)\u001b[0m\n\u001b[1;32m 250\u001b[0m parameters = {\"capabilities\": w3c_caps,\n\u001b[1;32m 251\u001b[0m \"desiredCapabilities\": capabilities}\n\u001b[0;32m--> 252\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCommand\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNEW_SESSION\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 253\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'sessionId'\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'value'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in 
\u001b[0;36mexecute\u001b[0;34m(self, driver_command, params)\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 322\u001b[0m response['value'] = self._unwrap_value(\n\u001b[1;32m 323\u001b[0m response.get('value', None))\n", "\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[0;34m(self, response)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexception_class\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mErrorInResponseException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m'message'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'message'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mWebDriverException\u001b[0m: Message: \n\n\n\nError\n\n\n
Cannot POST /session
\n\n\n\n" ] } ], "source": [ "import os\n", "\n", "from linkedin_scraper import Person, actions\n", "from selenium import webdriver\n", "\n", "chrome_options = webdriver.ChromeOptions()\n", "driver = webdriver.Remote(\n", " command_executor=\"http://192.168.1.101:3148\", options=chrome_options\n", ")\n", "\n", "email = \"aldrich.don@gmail.com\"\n", "# read the password from an environment variable (the variable name is arbitrary)\n", "password = os.getenv(\"LINKEDIN_PASSWORD\")\n", "actions.login(\n", " driver, email, password\n", ") # if email and password aren't given, login() prompts in the terminal\n", "person = Person(\"https://www.linkedin.com/in/don-aldrich-48a51815\", driver=driver)\n", "print(person)\n", "# driver.get(\"https://www.linkedin.com/in/don-aldrich-48a51815\")\n", "# driver.save_screenshot('./linkedin.png')\n", "# driver.quit()\n", "# driver = webdriver.Chrome()" ] }, { "cell_type": "code", "execution_count": null, "id": "bc65546e-a896-472c-ad49-e1258a885fd2", "metadata": { "tags": [] }, "outputs": [], "source": [ "import sys\n", "\n", "!{sys.executable} -m pip install linkedin_scraper" ] }, { "cell_type": "code", "execution_count": null, "id": "8d87b011-9c30-43ff-b259-a93fee86055a", "metadata": {}, "outputs": [], "source": [ "import ipywidgets as widgets\n", "\n", "out = widgets.FloatProgress(\n", " value=7.5,\n", " min=0,\n", " max=10.0,\n", " step=0.1,\n", " description=\"Loading:\",\n", " bar_style=\"info\",\n", " orientation=\"horizontal\",\n", ")\n", "out" ] }, { "cell_type": "code", "execution_count": null, "id": "587faeda-7336-42fd-bf2a-b389edb20252", "metadata": {}, "outputs": [], "source": [ "out = widgets.Output(layout={\"border\": \"1px solid black\"})\n", "out" ] }, { "cell_type": "code", "execution_count": null, "id": "fe9848c5-c1c2-4adf-b879-b3afd0ebae5e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }