Notebooks >> Scripts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

895 lines
58 KiB

{
"cells": [
{
"cell_type": "markdown",
"id": "ce111c4e-8e57-439f-888d-30b694c69a9d",
"metadata": {},
"source": [
"# Scrapers"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1f76145c-52ac-4f6a-bd12-3fdc95306486",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting jedi-language-server\n",
" Downloading jedi_language_server-0.36.0-py3-none-any.whl (30 kB)\n",
"Collecting pydantic<2.0,>=1.7\n",
" Downloading pydantic-1.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m19.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting pygls<0.12.0,>=0.11.1\n",
" Downloading pygls-0.11.3-py3-none-any.whl (86 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.7/86.7 KB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting docstring-to-markdown<1.0.0\n",
" Downloading docstring_to_markdown-0.10-py3-none-any.whl (17 kB)\n",
"Requirement already satisfied: jedi<0.19.0,>=0.18.0 in /opt/conda/lib/python3.9/site-packages (from jedi-language-server) (0.18.1)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from jedi<0.19.0,>=0.18.0->jedi-language-server) (0.8.3)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.9/site-packages (from pydantic<2.0,>=1.7->jedi-language-server) (4.2.0)\n",
"Collecting pydantic<2.0,>=1.7\n",
" Downloading pydantic-1.8.2-cp39-cp39-manylinux2014_x86_64.whl (11.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.3/11.3 MB\u001b[0m \u001b[31m22.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting typeguard<3,>=2.10.0\n",
" Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)\n",
"Installing collected packages: typeguard, pydantic, docstring-to-markdown, pygls, jedi-language-server\n",
"Successfully installed docstring-to-markdown-0.10 jedi-language-server-0.36.0 pydantic-1.8.2 pygls-0.11.3 typeguard-2.13.3\n"
]
}
],
"source": [
"import sys\n",
"\n",
"!{sys.executable} -m pip install -U jedi-language-server"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b59390e-6f72-4450-9e88-a20f5fc230df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting seleniumbase\n",
" Downloading seleniumbase-2.5.4.post1-py2.py3-none-any.whl (439 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.4/439.4 KB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting scrapy-selenium\n",
" Downloading scrapy_selenium-0.0.7-py3-none-any.whl (6.7 kB)\n",
"Collecting pytest-ordering==0.6\n",
" Downloading pytest_ordering-0.6-py3-none-any.whl (4.6 kB)\n",
"Collecting selenium==4.1.3\n",
" Downloading selenium-4.1.3-py3-none-any.whl (968 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m968.8/968.8 KB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting pytest-html==2.0.1\n",
" Downloading pytest_html-2.0.1-py2.py3-none-any.whl (15 kB)\n",
"Requirement already satisfied: pip>=22.0.4 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (22.0.4)\n",
"Collecting sbvirtualdisplay==1.1.0\n",
" Downloading sbvirtualdisplay-1.1.0-py2.py3-none-any.whl (13 kB)\n",
"Collecting soupsieve==2.3.2.post1\n",
" Using cached soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)\n",
"Requirement already satisfied: cffi==1.15.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.15.0)\n",
"Requirement already satisfied: decorator==5.1.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (5.1.1)\n",
"Collecting pluggy==1.0.0\n",
" Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)\n",
"Collecting more-itertools==8.12.0\n",
" Downloading more_itertools-8.12.0-py3-none-any.whl (54 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.3/54.3 KB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: setuptools>=62.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (62.1.0)\n",
"Requirement already satisfied: charset-normalizer==2.0.12 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.0.12)\n",
"Requirement already satisfied: pygments==2.12.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.12.0)\n",
"Requirement already satisfied: cssselect==1.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.1.0)\n",
"Collecting pytest==7.1.2\n",
" Downloading pytest-7.1.2-py3-none-any.whl (297 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m297.0/297.0 KB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting trio-websocket==0.9.2\n",
" Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)\n",
"Collecting wsproto==1.1.0\n",
" Downloading wsproto-1.1.0-py3-none-any.whl (24 kB)\n",
"Collecting nose==1.3.7\n",
" Downloading nose-1.3.7-py3-none-any.whl (154 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m154.7/154.7 KB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: requests==2.27.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.27.1)\n",
"Requirement already satisfied: matplotlib-inline==0.1.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.1.3)\n",
"Collecting boto==2.49.0\n",
" Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting toml==0.10.2\n",
" Using cached toml-0.10.2-py2.py3-none-any.whl (16 kB)\n",
"Requirement already satisfied: jedi==0.18.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.18.1)\n",
"Collecting chardet==4.0.0\n",
" Downloading chardet-4.0.0-py2.py3-none-any.whl (178 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m178.7/178.7 KB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: tomli>=2.0.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.0.1)\n",
"Requirement already satisfied: colorama==0.4.4 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.4.4)\n",
"Collecting pytest-forked==1.4.0\n",
" Downloading pytest_forked-1.4.0-py3-none-any.whl (4.9 kB)\n",
"Collecting execnet==1.9.0\n",
" Downloading execnet-1.9.0-py2.py3-none-any.whl (39 kB)\n",
"Requirement already satisfied: certifi>=2021.10.8 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2021.10.8)\n",
"Requirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (6.0)\n",
"Requirement already satisfied: traitlets>=5.1.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (5.1.1)\n",
"Requirement already satisfied: attrs>=21.4.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (21.4.0)\n",
"Collecting rich==12.3.0\n",
" Downloading rich-12.3.0-py3-none-any.whl (232 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.1/232.1 KB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting fasteners==0.17.3\n",
" Downloading fasteners-0.17.3-py3-none-any.whl (18 kB)\n",
"Requirement already satisfied: prompt-toolkit==3.0.29 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (3.0.29)\n",
"Collecting pytest-rerunfailures==10.2\n",
" Downloading pytest_rerunfailures-10.2-py3-none-any.whl (11 kB)\n",
"Collecting parameterized==0.8.1\n",
" Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)\n",
"Collecting pymysql==1.0.2\n",
" Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 KB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: packaging>=21.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (21.3)\n",
"Requirement already satisfied: platformdirs>=2.5.2 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.5.2)\n",
"Requirement already satisfied: sniffio==1.2.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.2.0)\n",
"Collecting trio==0.20.0\n",
" Downloading trio-0.20.0-py3-none-any.whl (359 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m359.0/359.0 KB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting ipython==7.33.0\n",
" Downloading ipython-7.33.0-py3-none-any.whl (793 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m793.8/793.8 KB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: wheel>=0.37.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.37.1)\n",
"Collecting setuptools-scm>=6.4.2\n",
" Downloading setuptools_scm-6.4.2-py3-none-any.whl (37 kB)\n",
"Collecting filelock>=3.6.0\n",
" Downloading filelock-3.6.0-py3-none-any.whl (10.0 kB)\n",
"Collecting pytest-metadata==2.0.1\n",
" Downloading pytest_metadata-2.0.1-py3-none-any.whl (9.9 kB)\n",
"Collecting ipdb==0.13.9\n",
" Downloading ipdb-0.13.9.tar.gz (16 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: idna==3.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (3.3)\n",
"Requirement already satisfied: parso==0.8.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.8.3)\n",
"Collecting py==1.11.0\n",
" Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 KB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: six==1.16.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.16.0)\n",
"Requirement already satisfied: beautifulsoup4==4.11.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (4.11.1)\n",
"Collecting h11==0.13.0\n",
" Downloading h11-0.13.0-py3-none-any.whl (58 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.2/58.2 KB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pyopenssl==22.0.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (22.0.0)\n",
"Collecting cryptography==37.0.1\n",
" Downloading cryptography-37.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m22.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: sortedcontainers==2.4.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.4.0)\n",
"Collecting pytest-xdist==2.5.0\n",
" Downloading pytest_xdist-2.5.0-py3-none-any.whl (41 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 KB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: urllib3==1.26.9 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.26.9)\n",
"Requirement already satisfied: pycparser==2.21 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.21)\n",
"Requirement already satisfied: tornado==6.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (6.1)\n",
"Collecting pdfminer.six==20220319\n",
" Downloading pdfminer.six-20220319-py3-none-any.whl (5.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting pyotp==2.6.0\n",
" Downloading pyotp-2.6.0-py2.py3-none-any.whl (11 kB)\n",
"Requirement already satisfied: Pillow==9.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (9.1.0)\n",
"Requirement already satisfied: pickleshare in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (0.7.5)\n",
"Requirement already satisfied: backcall in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (0.2.0)\n",
"Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (4.8.0)\n",
"Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit==3.0.29->seleniumbase) (0.2.5)\n",
"Collecting iniconfig\n",
" Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)\n",
"Collecting commonmark<0.10.0,>=0.9.0\n",
" Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.1/51.1 KB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: async-generator>=1.9 in /opt/conda/lib/python3.9/site-packages (from trio==0.20.0->seleniumbase) (1.10)\n",
"Collecting outcome\n",
" Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)\n",
"Collecting scrapy>=1.0.0\n",
" Downloading Scrapy-2.6.1-py2.py3-none-any.whl (264 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.3/264.3 KB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.9/site-packages (from packaging>=21.3->seleniumbase) (3.0.8)\n",
"Collecting protego>=0.1.15\n",
" Downloading Protego-0.2.1-py2.py3-none-any.whl (8.2 kB)\n",
"Requirement already satisfied: parsel>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (1.6.0)\n",
"Collecting tldextract\n",
" Downloading tldextract-3.2.1-py3-none-any.whl (87 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.8/87.8 KB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: lxml>=3.5.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (4.8.0)\n",
"Collecting Twisted>=17.9.0\n",
" Downloading Twisted-22.4.0-py3-none-any.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hCollecting service-identity>=16.0.0\n",
" Downloading service_identity-21.1.0-py2.py3-none-any.whl (12 kB)\n",
"Requirement already satisfied: w3lib>=1.17.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (1.22.0)\n",
"Collecting itemadapter>=0.1.0\n",
" Downloading itemadapter-0.5.0-py3-none-any.whl (10 kB)\n",
"Collecting itemloaders>=1.0.1\n",
" Downloading itemloaders-1.0.4-py3-none-any.whl (11 kB)\n",
"Collecting queuelib>=1.4.2\n",
" Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)\n",
"Collecting zope.interface>=4.1.3\n",
" Downloading zope.interface-5.4.0-cp39-cp39-manylinux2010_x86_64.whl (255 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m255.8/255.8 KB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting PyDispatcher>=2.0.5\n",
" Downloading PyDispatcher-2.0.5.zip (47 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.6/47.6 KB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting jmespath>=0.9.5\n",
" Downloading jmespath-1.0.0-py3-none-any.whl (23 kB)\n",
"Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython==7.33.0->seleniumbase) (0.7.0)\n",
"Collecting pyasn1-modules\n",
" Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m155.3/155.3 KB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pyasn1\n",
" Downloading pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.1/77.1 KB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting Automat>=0.8.0\n",
" Downloading Automat-20.2.0-py2.py3-none-any.whl (31 kB)\n",
"Collecting constantly>=15.1\n",
" Downloading constantly-15.1.0-py2.py3-none-any.whl (7.9 kB)\n",
"Requirement already satisfied: typing-extensions>=3.6.5 in /opt/conda/lib/python3.9/site-packages (from Twisted>=17.9.0->scrapy>=1.0.0->scrapy-selenium) (4.2.0)\n",
"Collecting incremental>=21.3.0\n",
" Downloading incremental-21.3.0-py2.py3-none-any.whl (15 kB)\n",
"Collecting hyperlink>=17.1.1\n",
" Downloading hyperlink-21.0.0-py2.py3-none-any.whl (74 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.6/74.6 KB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: PySocks!=1.5.7,<2.0,>=1.5.6 in /opt/conda/lib/python3.9/site-packages (from urllib3==1.26.9->seleniumbase) (1.7.1)\n",
"Collecting requests-file>=1.4\n",
" Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)\n",
"Building wheels for collected packages: ipdb, PyDispatcher\n",
" Building wheel for ipdb (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for ipdb: filename=ipdb-0.13.9-py3-none-any.whl size=11646 sha256=4b32fcc05f5ab0b0fa9bb2fdf21ec3d5ae8465b51bb9d106e916a77dda392b14\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/f7/29/9a/cf774cd86e9802f075a0be1c9e0830bc062d07897b2e9e87cd\n",
" Building wheel for PyDispatcher (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for PyDispatcher: filename=PyDispatcher-2.0.5-py3-none-any.whl size=11516 sha256=2d98fbbdf2c1959b062f50876512ac0499e2ceacddd48fd2cda2600024aeb3d7\n",
" Stored in directory: /home/jovyan/.cache/pip/wheels/a5/de/8a/4b52190a95d99c042ec6bd5ad2de3a3c1b5ce71d69f0bbd036\n",
"Successfully built ipdb PyDispatcher\n",
"Installing collected packages: pyotp, PyDispatcher, pyasn1, parameterized, nose, iniconfig, incremental, constantly, commonmark, boto, zope.interface, toml, soupsieve, sbvirtualdisplay, rich, queuelib, pymysql, pyasn1-modules, py, protego, pluggy, outcome, more-itertools, jmespath, itemadapter, hyperlink, h11, filelock, fasteners, execnet, chardet, Automat, wsproto, Twisted, trio, setuptools-scm, requests-file, pytest, ipython, cryptography, trio-websocket, tldextract, service-identity, pytest-rerunfailures, pytest-ordering, pytest-metadata, pytest-forked, pdfminer.six, itemloaders, ipdb, scrapy, pytest-xdist, pytest-html, selenium, seleniumbase, scrapy-selenium\n",
" Attempting uninstall: soupsieve\n",
" Found existing installation: soupsieve 2.3.1\n",
" Uninstalling soupsieve-2.3.1:\n",
" Successfully uninstalled soupsieve-2.3.1\n",
" Attempting uninstall: ipython\n",
" Found existing installation: ipython 8.3.0\n",
" Uninstalling ipython-8.3.0:\n",
" Successfully uninstalled ipython-8.3.0\n",
" Attempting uninstall: cryptography\n",
" Found existing installation: cryptography 36.0.2\n",
" Uninstalling cryptography-36.0.2:\n",
" Successfully uninstalled cryptography-36.0.2\n",
"Successfully installed Automat-20.2.0 PyDispatcher-2.0.5 Twisted-22.4.0 boto-2.49.0 chardet-4.0.0 commonmark-0.9.1 constantly-15.1.0 cryptography-37.0.1 execnet-1.9.0 fasteners-0.17.3 filelock-3.6.0 h11-0.13.0 hyperlink-21.0.0 incremental-21.3.0 iniconfig-1.1.1 ipdb-0.13.9 ipython-7.33.0 itemadapter-0.5.0 itemloaders-1.0.4 jmespath-1.0.0 more-itertools-8.12.0 nose-1.3.7 outcome-1.1.0 parameterized-0.8.1 pdfminer.six-20220319 pluggy-1.0.0 protego-0.2.1 py-1.11.0 pyasn1-0.4.8 pyasn1-modules-0.2.8 pymysql-1.0.2 pyotp-2.6.0 pytest-7.1.2 pytest-forked-1.4.0 pytest-html-2.0.1 pytest-metadata-2.0.1 pytest-ordering-0.6 pytest-rerunfailures-10.2 pytest-xdist-2.5.0 queuelib-1.6.2 requests-file-1.5.1 rich-12.3.0 sbvirtualdisplay-1.1.0 scrapy-2.6.1 scrapy-selenium-0.0.7 selenium-4.1.3 seleniumbase-2.5.4.post1 service-identity-21.1.0 setuptools-scm-6.4.2 soupsieve-2.3.2.post1 tldextract-3.2.1 toml-0.10.2 trio-0.20.0 trio-websocket-0.9.2 wsproto-1.1.0 zope.interface-5.4.0\n",
"Proceed (Y/n)? "
]
}
],
"source": [
"import sys\n",
"\n",
"!{sys.executable} -m pip install seleniumbase scrapy-selenium\n",
"\n",
"!{sys.executable} -m pip uninstall -q selenium\n",
"!{sys.executable} -m pip install --user selenium==4.0.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "289368e8",
"metadata": {},
"outputs": [],
"source": [
"# !pip install scrapy-selenium\n",
"\n",
"!python scrapy/learn/awesome_selfhosted/spiders/scrapy_selenium.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b98dc06b-c523-49a5-8202-8eaa4504da02",
"metadata": {},
"outputs": [],
"source": [
"# A bare URL after `!` is passed to the shell as a command name and cannot run.\n",
"# NOTE(review): assumes the intent was to download the Scrapy PDF docs - confirm.\n",
"!wget -q -O scrapy-docs.pdf https://docs.scrapy.org/_/downloads/en/latest/pdf/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ceb94f3e-047d-4902-bcd7-9e58e3d47ed7",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"!{sys.executable} -m pip install scrapy-pyppeteer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f246228d-904e-475b-a192-1e2e2a7b6914",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"!python scrapy/scrapy-linkedin/scrapy-linkedin/spiders/linkedin_login.py"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "facad22a",
"metadata": {
"execution": {
"iopub.execute_input": "2021-10-16T00:29:30.210663Z",
"iopub.status.busy": "2021-10-16T00:29:30.210346Z",
"iopub.status.idle": "2021-10-16T00:29:33.121592Z",
"shell.execute_reply": "2021-10-16T00:29:33.119808Z",
"shell.execute_reply.started": "2021-10-16T00:29:30.210632Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2021-10-16 00:29:31 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)\n",
"2021-10-16 00:29:31 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31\n",
"2021-10-16 00:29:31 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor\n",
"2021-10-16 00:29:31 [scrapy.crawler] INFO: Overridden settings:\n",
"{}\n",
"2021-10-16 00:29:31 [scrapy.extensions.telnet] INFO: Telnet Password: c3ecb039fe45254e\n",
"2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled extensions:\n",
"['scrapy.extensions.corestats.CoreStats',\n",
" 'scrapy.extensions.telnet.TelnetConsole',\n",
" 'scrapy.extensions.memusage.MemoryUsage',\n",
" 'scrapy.extensions.logstats.LogStats']\n",
"2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled downloader middlewares:\n",
"['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',\n",
" 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',\n",
" 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',\n",
" 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',\n",
" 'scrapy.downloadermiddlewares.retry.RetryMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',\n",
" 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',\n",
" 'scrapy.downloadermiddlewares.stats.DownloaderStats']\n",
"2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled spider middlewares:\n",
"['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',\n",
" 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',\n",
" 'scrapy.spidermiddlewares.referer.RefererMiddleware',\n",
" 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',\n",
" 'scrapy.spidermiddlewares.depth.DepthMiddleware']\n",
"2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled item pipelines:\n",
"[]\n",
"2021-10-16 00:29:31 [scrapy.core.engine] INFO: Spider opened\n",
"2021-10-16 00:29:31 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\n",
"2021-10-16 00:29:31 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023\n",
"2021-10-16 00:29:31 [py.warnings] WARNING: /opt/conda/lib/python3.9/site-packages/scrapy_splash/request.py:42: ScrapyDeprecationWarning: Call to deprecated function to_native_str. Use to_unicode instead.\n",
" url = to_native_str(url)\n",
"\n",
"2021-10-16 00:29:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://github.com/awesome-selfhosted/awesome-selfhosted> (referer: None)\n",
"2021-10-16 00:29:32 [scrapy.core.engine] INFO: Closing spider (finished)\n",
"2021-10-16 00:29:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:\n",
"{'downloader/request_bytes': 251,\n",
" 'downloader/request_count': 1,\n",
" 'downloader/request_method_count/GET': 1,\n",
" 'downloader/response_bytes': 125130,\n",
" 'downloader/response_count': 1,\n",
" 'downloader/response_status_count/200': 1,\n",
" 'elapsed_time_seconds': 1.484227,\n",
" 'finish_reason': 'finished',\n",
" 'finish_time': datetime.datetime(2021, 10, 16, 0, 29, 32, 805803),\n",
" 'httpcompression/response_bytes': 637769,\n",
" 'httpcompression/response_count': 1,\n",
" 'log_count/DEBUG': 1,\n",
" 'log_count/INFO': 10,\n",
" 'log_count/WARNING': 1,\n",
" 'memusage/max': 61575168,\n",
" 'memusage/startup': 61575168,\n",
" 'response_received_count': 1,\n",
" 'scheduler/dequeued': 1,\n",
" 'scheduler/dequeued/memory': 1,\n",
" 'scheduler/enqueued': 1,\n",
" 'scheduler/enqueued/memory': 1,\n",
" 'start_time': datetime.datetime(2021, 10, 16, 0, 29, 31, 321576)}\n",
"2021-10-16 00:29:32 [scrapy.core.engine] INFO: Spider closed (finished)\n"
]
}
],
"source": [
"!python scrapy/github/awesome/spiders/awesome_list.py"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2f3cbfcf",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"# Ask the Splash service to render the page. Passing the arguments through\n",
"# `params=` lets requests URL-encode the nested target URL instead of splicing\n",
"# it into the query string by hand.\n",
"r = requests.get(\n",
"    \"http://splash:8050/render.json\",\n",
"    params={\n",
"        \"url\": \"http://domain.com/page-with-javascript.html\",\n",
"        \"timeout\": 10,\n",
"        \"wait\": 0.5,\n",
"    },\n",
"    # requests has no default timeout; bound the call slightly above the\n",
"    # 10 s render timeout requested from Splash so the cell cannot hang.\n",
"    timeout=15,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a7ed7d2-d379-489e-9679-ab818c31a395",
"metadata": {},
"outputs": [],
"source": [
"# requests exposes the HTTP status as `status_code`; `Response` has no `status`.\n",
"print(r.status_code)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7aa9bbb4-f83e-4fb9-be62-e10d361177d2",
"metadata": {},
"outputs": [],
"source": [
"!python scrapy/github/awesome_selfhosted/spiders/quotes.py"
]
},
{
"cell_type": "markdown",
"id": "29fb8d4b-32ee-4b16-bd6b-00e042aa807e",
"metadata": {},
"source": [
"# Scrapy Integration Testing"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9027f1aa-a239-46f7-bf4f-04f50b11120d",
"metadata": {
"execution": {
"iopub.execute_input": "2021-10-16T00:25:34.937091Z",
"iopub.status.busy": "2021-10-16T00:25:34.936806Z",
"iopub.status.idle": "2021-10-16T00:25:36.936409Z",
"shell.execute_reply": "2021-10-16T00:25:36.935417Z",
"shell.execute_reply.started": "2021-10-16T00:25:34.937066Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2021-10-16 00:25:35 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)\n",
"2021-10-16 00:25:35 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31\n",
"2021-10-16 00:25:35 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor\n",
"2021-10-16 00:25:35 [scrapy.crawler] INFO: Overridden settings:\n",
"{}\n",
"2021-10-16 00:25:35 [scrapy.extensions.telnet] INFO: Telnet Password: b8a082d3f20f441c\n",
"2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled extensions:\n",
"['scrapy.extensions.corestats.CoreStats',\n",
" 'scrapy.extensions.telnet.TelnetConsole',\n",
" 'scrapy.extensions.memusage.MemoryUsage',\n",
" 'scrapy.extensions.logstats.LogStats']\n",
"2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled downloader middlewares:\n",
"['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',\n",
" 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',\n",
" 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',\n",
" 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',\n",
" 'scrapy.downloadermiddlewares.retry.RetryMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',\n",
" 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',\n",
" 'scrapy.downloadermiddlewares.stats.DownloaderStats']\n",
"2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled spider middlewares:\n",
"['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',\n",
" 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',\n",
" 'scrapy.spidermiddlewares.referer.RefererMiddleware',\n",
" 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',\n",
" 'scrapy.spidermiddlewares.depth.DepthMiddleware']\n",
"2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled item pipelines:\n",
"[]\n",
"2021-10-16 00:25:35 [scrapy.core.engine] INFO: Spider opened\n",
"2021-10-16 00:25:35 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\n",
"2021-10-16 00:25:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023\n",
"2021-10-16 00:25:36 [py.warnings] WARNING: /opt/conda/lib/python3.9/site-packages/scrapy_splash/request.py:42: ScrapyDeprecationWarning: Call to deprecated function to_native_str. Use to_unicode instead.\n",
" url = to_native_str(url)\n",
"\n",
"2021-10-16 00:25:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://portfolio.donavanaldrich.com> (referer: None)\n",
"2021-10-16 00:25:36 [scrapy.core.scraper] ERROR: Spider error processing <GET https://portfolio.donavanaldrich.com> (referer: None)\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/lib/python3.9/site-packages/twisted/internet/defer.py\", line 858, in _runCallbacks\n",
" current.result = callback( # type: ignore[misc]\n",
" File \"/home/jovyan/code/scrapy/github/awesome/spiders/splash_test.py\", line 29, in parse\n",
" png_bytes = base64.b64decode(response.data['png'])\n",
"AttributeError: 'HtmlResponse' object has no attribute 'data'\n",
"2021-10-16 00:25:36 [scrapy.core.engine] INFO: Closing spider (finished)\n",
"2021-10-16 00:25:36 [scrapy.statscollectors] INFO: Dumping Scrapy stats:\n",
"{'downloader/request_bytes': 232,\n",
" 'downloader/request_count': 1,\n",
" 'downloader/request_method_count/GET': 1,\n",
" 'downloader/response_bytes': 30168,\n",
" 'downloader/response_count': 1,\n",
" 'downloader/response_status_count/200': 1,\n",
" 'elapsed_time_seconds': 0.689888,\n",
" 'finish_reason': 'finished',\n",
" 'finish_time': datetime.datetime(2021, 10, 16, 0, 25, 36, 688102),\n",
" 'httpcompression/response_bytes': 208226,\n",
" 'httpcompression/response_count': 1,\n",
" 'log_count/DEBUG': 1,\n",
" 'log_count/ERROR': 1,\n",
" 'log_count/INFO': 10,\n",
" 'log_count/WARNING': 1,\n",
" 'memusage/max': 59232256,\n",
" 'memusage/startup': 59232256,\n",
" 'response_received_count': 1,\n",
" 'scheduler/dequeued': 1,\n",
" 'scheduler/dequeued/memory': 1,\n",
" 'scheduler/enqueued': 1,\n",
" 'scheduler/enqueued/memory': 1,\n",
" 'spider_exceptions/AttributeError': 1,\n",
" 'start_time': datetime.datetime(2021, 10, 16, 0, 25, 35, 998214)}\n",
"2021-10-16 00:25:36 [scrapy.core.engine] INFO: Spider closed (finished)\n"
]
}
],
"source": [
"!python scrapy/github/awesome/spiders/splash_test.py"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "1a956486-2814-440d-a844-3be8769f4d74",
"metadata": {
"execution": {
"iopub.execute_input": "2021-10-16T00:28:08.970348Z",
"iopub.status.busy": "2021-10-16T00:28:08.970102Z",
"iopub.status.idle": "2021-10-16T00:28:10.849795Z",
"shell.execute_reply": "2021-10-16T00:28:10.848508Z",
"shell.execute_reply.started": "2021-10-16T00:28:08.970324Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2021-10-16 00:28:09 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)\n",
"2021-10-16 00:28:09 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31\n",
"2021-10-16 00:28:09 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor\n",
"2021-10-16 00:28:09 [scrapy.crawler] INFO: Overridden settings:\n",
"{}\n",
"2021-10-16 00:28:09 [scrapy.extensions.telnet] INFO: Telnet Password: 5715398da4f12340\n",
"2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled extensions:\n",
"['scrapy.extensions.corestats.CoreStats',\n",
" 'scrapy.extensions.telnet.TelnetConsole',\n",
" 'scrapy.extensions.memusage.MemoryUsage',\n",
" 'scrapy.extensions.logstats.LogStats']\n",
"2021-10-16 00:28:09 [scrapy.middleware] WARNING: Disabled SeleniumMiddleware: SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set\n",
"2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled downloader middlewares:\n",
"['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',\n",
" 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',\n",
" 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',\n",
" 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',\n",
" 'scrapy.downloadermiddlewares.retry.RetryMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',\n",
" 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',\n",
" 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',\n",
" 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',\n",
" 'scrapy.downloadermiddlewares.stats.DownloaderStats']\n",
"2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled spider middlewares:\n",
"['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',\n",
" 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',\n",
" 'scrapy.spidermiddlewares.referer.RefererMiddleware',\n",
" 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',\n",
" 'scrapy.spidermiddlewares.depth.DepthMiddleware']\n",
"2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled item pipelines:\n",
"[]\n",
"2021-10-16 00:28:09 [scrapy.core.engine] INFO: Spider opened\n",
"2021-10-16 00:28:09 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)\n",
"2021-10-16 00:28:10 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023\n",
"2021-10-16 00:28:10 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.google.com/> from <GET https://google.com>\n",
"2021-10-16 00:28:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.google.com/> (referer: None)\n",
"2021-10-16 00:28:10 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.google.com/> (referer: None)\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/lib/python3.9/site-packages/twisted/internet/defer.py\", line 858, in _runCallbacks\n",
" current.result = callback( # type: ignore[misc]\n",
" File \"/home/jovyan/code/scrapy/github/awesome/spiders/selenium_test.py\", line 29, in parse_result\n",
" print(response.request.meta['driver'].title)\n",
"KeyError: 'driver'\n",
"2021-10-16 00:28:10 [scrapy.core.engine] INFO: Closing spider (finished)\n",
"2021-10-16 00:28:10 [scrapy.statscollectors] INFO: Dumping Scrapy stats:\n",
"{'downloader/request_bytes': 432,\n",
" 'downloader/request_count': 2,\n",
" 'downloader/request_method_count/GET': 2,\n",
" 'downloader/response_bytes': 7820,\n",
" 'downloader/response_count': 2,\n",
" 'downloader/response_status_count/200': 1,\n",
" 'downloader/response_status_count/301': 1,\n",
" 'elapsed_time_seconds': 0.602981,\n",
" 'finish_reason': 'finished',\n",
" 'finish_time': datetime.datetime(2021, 10, 16, 0, 28, 10, 602166),\n",
" 'httpcompression/response_bytes': 14104,\n",
" 'httpcompression/response_count': 1,\n",
" 'log_count/DEBUG': 2,\n",
" 'log_count/ERROR': 1,\n",
" 'log_count/INFO': 10,\n",
" 'log_count/WARNING': 1,\n",
" 'memusage/max': 56184832,\n",
" 'memusage/startup': 56184832,\n",
" 'response_received_count': 1,\n",
" 'scheduler/dequeued': 2,\n",
" 'scheduler/dequeued/memory': 2,\n",
" 'scheduler/enqueued': 2,\n",
" 'scheduler/enqueued/memory': 2,\n",
" 'spider_exceptions/KeyError': 1,\n",
" 'start_time': datetime.datetime(2021, 10, 16, 0, 28, 9, 999185)}\n",
"2021-10-16 00:28:10 [scrapy.core.engine] INFO: Spider closed (finished)\n"
]
}
],
"source": [
"!python scrapy/github/awesome/spiders/selenium_test.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "168fad05-89c9-4d48-9b04-8d1f75be4745",
"metadata": {},
"outputs": [],
"source": [
"# Import only the name this cell uses rather than the module's whole namespace.\n",
"# NOTE(review): assumes AnonymousBrowserTest is defined in utils.browser.test -- confirm.\n",
"from utils.browser.test import AnonymousBrowserTest\n",
"\n",
"AnonymousBrowserTest()\n",
"\n",
"# Also run the same module as a standalone script.\n",
"!python utils/browser/test.py"
]
},
{
"cell_type": "markdown",
"id": "cd7bf7a6-3bfa-4577-ab18-7f21d9564e31",
"metadata": {},
"source": [
"Testing"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5369ec44-81f0-4875-a74f-419c7ab9e5dd",
"metadata": {},
"outputs": [],
"source": [
"# Smoke test of the remote browser stack: open a remote Chrome session, run the\n",
"# network and browser checks, then release the session.\n",
"# NOTE(review): webdriver, log and the check helpers are presumably supplied by\n",
"# this wildcard import -- confirm, then switch to explicit names.\n",
"from utils.browser.network import *\n",
"\n",
"chrome_options = webdriver.ChromeOptions()\n",
"\n",
"# capabilities = browser_capabilities(module)\n",
"\n",
"print(\"Webdriver Initializing\")\n",
"\n",
"driver = webdriver.Remote(\n",
"    command_executor=\"http://192.168.1.101:4444/wd/hub\",\n",
"    options=chrome_options,\n",
"    # desired_capabilities=capabilities,\n",
")\n",
"\n",
"try:\n",
"    log.info(\"Proxy IP info...\")\n",
"    proxy_ip(\"data/proxy_ip.json\")\n",
"\n",
"    log.info(\"VPN Settings...\")\n",
"    vpn_settings(\"data/vpn.json\")\n",
"\n",
"    log.info(\"Selenoid Status...\")\n",
"    selenoid_status(\"data/selenoid_status.json\")\n",
"\n",
"    log.info(\"Real IP...\")\n",
"    ip_status(driver)\n",
"\n",
"    log.info(\"Browser Settings...\")\n",
"    browser_config(driver)\n",
"\n",
"    log.info(\"Random User Agent...\")\n",
"    # The checks below are currently disabled.\n",
"    # user_agent(driver)\n",
"    # driver_object(driver)\n",
"    # browser_feature(driver)\n",
"    # uc_test()\n",
"finally:\n",
"    # Always end the remote session, even if a check above raised.\n",
"    driver.quit()\n",
"    log.info(\"Session Terminated!\")\n",
"\n",
"log.info(\"test Finished\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cfad181f-0f1e-43b2-9479-df868e380e16",
"metadata": {
"execution": {
"iopub.execute_input": "2021-10-16T00:23:23.049733Z",
"iopub.status.busy": "2021-10-16T00:23:23.049274Z",
"iopub.status.idle": "2021-10-16T00:23:23.187517Z",
"shell.execute_reply": "2021-10-16T00:23:23.182088Z",
"shell.execute_reply.started": "2021-10-16T00:23:23.049695Z"
},
"tags": []
},
"outputs": [
{
"ename": "WebDriverException",
"evalue": "Message: <!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"utf-8\">\n<title>Error</title>\n</head>\n<body>\n<pre>Cannot POST /session</pre>\n</body>\n</html>\n\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_111/2032127904.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mchrome_options\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChromeOptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m driver = webdriver.Remote(\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mcommand_executor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"http://192.168.1.101:3148\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchrome_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m )\n",
"\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, command_executor, desired_capabilities, browser_profile, proxy, keep_alive, file_detector, options)\u001b[0m\n\u001b[1;32m 155\u001b[0m warnings.warn(\"Please use FirefoxOptions to set browser profile\",\n\u001b[1;32m 156\u001b[0m DeprecationWarning, stacklevel=2)\n\u001b[0;32m--> 157\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart_session\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcapabilities\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbrowser_profile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 158\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_switch_to\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSwitchTo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mobile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMobile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mstart_session\u001b[0;34m(self, capabilities, browser_profile)\u001b[0m\n\u001b[1;32m 250\u001b[0m parameters = {\"capabilities\": w3c_caps,\n\u001b[1;32m 251\u001b[0m \"desiredCapabilities\": capabilities}\n\u001b[0;32m--> 252\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCommand\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNEW_SESSION\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 253\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'sessionId'\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'value'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[0;34m(self, driver_command, params)\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 322\u001b[0m response['value'] = self._unwrap_value(\n\u001b[1;32m 323\u001b[0m response.get('value', None))\n",
"\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[0;34m(self, response)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexception_class\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mErrorInResponseException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"\"\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m'message'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'message'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mWebDriverException\u001b[0m: Message: <!DOCTYPE html>\n<html lang=\"en\">\n<head>\n<meta charset=\"utf-8\">\n<title>Error</title>\n</head>\n<body>\n<pre>Cannot POST /session</pre>\n</body>\n</html>\n\n"
]
}
],
"source": [
"import os\n",
"\n",
"from linkedin_scraper import Person, actions\n",
"from selenium import webdriver\n",
"\n",
"chrome_options = webdriver.ChromeOptions()\n",
"# NOTE(review): the recorded run of this cell failed with \"Cannot POST /session\"\n",
"# from this endpoint -- confirm it is the WebDriver server URL.\n",
"driver = webdriver.Remote(\n",
"    command_executor=\"http://192.168.1.101:3148\", options=chrome_options\n",
")\n",
"\n",
"email = \"aldrich.don@gmail.com\"\n",
"# Read the secret from the environment so it is never stored in the notebook.\n",
"# Export LINKEDIN_PASSWORD before starting the kernel; os.getenv returns None\n",
"# when the variable is unset.\n",
"password = os.getenv(\"LINKEDIN_PASSWORD\")\n",
"actions.login(\n",
"    driver, email, password\n",
")  # if email and password isnt given, it'll prompt in terminal\n",
"person = Person(\"https://www.linkedin.com/in/don-aldrich-48a51815\", driver=driver)\n",
"print(person)\n",
"# driver.get(\"https://www.linkedin.com/in/don-aldrich-48a51815\")\n",
"# driver.save_screenshot('./linkedin.png')\n",
"# driver.quit()\n",
"# driver = webdriver.Chrome()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc65546e-a896-472c-ad49-e1258a885fd2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Install linkedin_scraper into the environment of the running kernel.\n",
"# Run this before any cell that imports linkedin_scraper.\n",
"%pip install linkedin_scraper"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8d87b011-9c30-43ff-b259-a93fee86055a",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'widgets' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_387/914693869.py\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m out = widgets.FloatProgress(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m7.5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmax\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'widgets' is not defined"
]
}
],
"source": [
"# `widgets` is the conventional alias for ipywidgets; import it here so the\n",
"# cell runs on a fresh kernel.\n",
"import ipywidgets as widgets\n",
"\n",
"# Horizontal progress bar showing 7.5 on a 0-10 scale.\n",
"out = widgets.FloatProgress(\n",
"    value=7.5,\n",
"    min=0,\n",
"    max=10.0,\n",
"    step=0.1,\n",
"    description=\"Loading:\",\n",
"    bar_style=\"info\",\n",
"    orientation=\"horizontal\",\n",
")\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "587faeda-7336-42fd-bf2a-b389edb20252",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'widgets' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_387/1437867774.py\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwidgets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlayout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'border'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'1px solid black'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'widgets' is not defined"
]
}
],
"source": [
"# `widgets` is the conventional alias for ipywidgets; import it here so the\n",
"# cell runs on a fresh kernel.\n",
"import ipywidgets as widgets\n",
"\n",
"# Output area with a visible border; displayed as the cell result.\n",
"out = widgets.Output(layout={\"border\": \"1px solid black\"})\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe9848c5-c1c2-4adf-b879-b3afd0ebae5e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}