Scrapers
In [2]:
import sys
!{sys.executable} -m pip install -U jedi-language-server
Collecting jedi-language-server
Downloading jedi_language_server-0.36.0-py3-none-any.whl (30 kB)
Collecting pydantic<2.0,>=1.7
Downloading pydantic-1.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.2 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.2/12.2 MB 19.2 MB/s eta 0:00:00
Collecting pygls<0.12.0,>=0.11.1
Downloading pygls-0.11.3-py3-none-any.whl (86 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.7/86.7 KB 3.0 MB/s eta 0:00:00
Collecting docstring-to-markdown<1.0.0
Downloading docstring_to_markdown-0.10-py3-none-any.whl (17 kB)
Requirement already satisfied: jedi<0.19.0,>=0.18.0 in /opt/conda/lib/python3.9/site-packages (from jedi-language-server) (0.18.1)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /opt/conda/lib/python3.9/site-packages (from jedi<0.19.0,>=0.18.0->jedi-language-server) (0.8.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.9/site-packages (from pydantic<2.0,>=1.7->jedi-language-server) (4.2.0)
Collecting pydantic<2.0,>=1.7
Downloading pydantic-1.8.2-cp39-cp39-manylinux2014_x86_64.whl (11.3 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 22.1 MB/s eta 0:00:00
Collecting typeguard<3,>=2.10.0
Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, pydantic, docstring-to-markdown, pygls, jedi-language-server
Successfully installed docstring-to-markdown-0.10 jedi-language-server-0.36.0 pydantic-1.8.2 pygls-0.11.3 typeguard-2.13.3
In [ ]:
import sys
!{sys.executable} -m pip install seleniumbase scrapy-selenium
!{sys.executable} -m pip uninstall -q -y selenium  # -y skips the interactive confirmation prompt, which would hang a notebook
!{sys.executable} -m pip install --user selenium==4.0.0
Collecting seleniumbase
Downloading seleniumbase-2.5.4.post1-py2.py3-none-any.whl (439 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 439.4/439.4 KB 7.1 MB/s eta 0:00:00
Collecting scrapy-selenium
Downloading scrapy_selenium-0.0.7-py3-none-any.whl (6.7 kB)
Collecting pytest-ordering==0.6
Downloading pytest_ordering-0.6-py3-none-any.whl (4.6 kB)
Collecting selenium==4.1.3
Downloading selenium-4.1.3-py3-none-any.whl (968 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 968.8/968.8 KB 17.0 MB/s eta 0:00:00
Collecting pytest-html==2.0.1
Downloading pytest_html-2.0.1-py2.py3-none-any.whl (15 kB)
Requirement already satisfied: pip>=22.0.4 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (22.0.4)
Collecting sbvirtualdisplay==1.1.0
Downloading sbvirtualdisplay-1.1.0-py2.py3-none-any.whl (13 kB)
Collecting soupsieve==2.3.2.post1
Using cached soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)
Requirement already satisfied: cffi==1.15.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.15.0)
Requirement already satisfied: decorator==5.1.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (5.1.1)
Collecting pluggy==1.0.0
Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)
Collecting more-itertools==8.12.0
Downloading more_itertools-8.12.0-py3-none-any.whl (54 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 54.3/54.3 KB 6.1 MB/s eta 0:00:00
Requirement already satisfied: setuptools>=62.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (62.1.0)
Requirement already satisfied: charset-normalizer==2.0.12 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.0.12)
Requirement already satisfied: pygments==2.12.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.12.0)
Requirement already satisfied: cssselect==1.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.1.0)
Collecting pytest==7.1.2
Downloading pytest-7.1.2-py3-none-any.whl (297 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 297.0/297.0 KB 14.1 MB/s eta 0:00:00
Collecting trio-websocket==0.9.2
Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting wsproto==1.1.0
Downloading wsproto-1.1.0-py3-none-any.whl (24 kB)
Collecting nose==1.3.7
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 154.7/154.7 KB 9.2 MB/s eta 0:00:00
Requirement already satisfied: requests==2.27.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.27.1)
Requirement already satisfied: matplotlib-inline==0.1.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.1.3)
Collecting boto==2.49.0
Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 17.5 MB/s eta 0:00:00
Collecting toml==0.10.2
Using cached toml-0.10.2-py2.py3-none-any.whl (16 kB)
Requirement already satisfied: jedi==0.18.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.18.1)
Collecting chardet==4.0.0
Downloading chardet-4.0.0-py2.py3-none-any.whl (178 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 178.7/178.7 KB 15.1 MB/s eta 0:00:00
Requirement already satisfied: tomli>=2.0.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.0.1)
Requirement already satisfied: colorama==0.4.4 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.4.4)
Collecting pytest-forked==1.4.0
Downloading pytest_forked-1.4.0-py3-none-any.whl (4.9 kB)
Collecting execnet==1.9.0
Downloading execnet-1.9.0-py2.py3-none-any.whl (39 kB)
Requirement already satisfied: certifi>=2021.10.8 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2021.10.8)
Requirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (6.0)
Requirement already satisfied: traitlets>=5.1.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (5.1.1)
Requirement already satisfied: attrs>=21.4.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (21.4.0)
Collecting rich==12.3.0
Downloading rich-12.3.0-py3-none-any.whl (232 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.1/232.1 KB 9.3 MB/s eta 0:00:00
Collecting fasteners==0.17.3
Downloading fasteners-0.17.3-py3-none-any.whl (18 kB)
Requirement already satisfied: prompt-toolkit==3.0.29 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (3.0.29)
Collecting pytest-rerunfailures==10.2
Downloading pytest_rerunfailures-10.2-py3-none-any.whl (11 kB)
Collecting parameterized==0.8.1
Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)
Collecting pymysql==1.0.2
Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.8/43.8 KB 4.0 MB/s eta 0:00:00
Requirement already satisfied: packaging>=21.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (21.3)
Requirement already satisfied: platformdirs>=2.5.2 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.5.2)
Requirement already satisfied: sniffio==1.2.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.2.0)
Collecting trio==0.20.0
Downloading trio-0.20.0-py3-none-any.whl (359 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 359.0/359.0 KB 12.1 MB/s eta 0:00:00
Collecting ipython==7.33.0
Downloading ipython-7.33.0-py3-none-any.whl (793 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 793.8/793.8 KB 15.3 MB/s eta 0:00:00
Requirement already satisfied: wheel>=0.37.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.37.1)
Collecting setuptools-scm>=6.4.2
Downloading setuptools_scm-6.4.2-py3-none-any.whl (37 kB)
Collecting filelock>=3.6.0
Downloading filelock-3.6.0-py3-none-any.whl (10.0 kB)
Collecting pytest-metadata==2.0.1
Downloading pytest_metadata-2.0.1-py3-none-any.whl (9.9 kB)
Collecting ipdb==0.13.9
Downloading ipdb-0.13.9.tar.gz (16 kB)
Preparing metadata (setup.py) ... done
Requirement already satisfied: idna==3.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (3.3)
Requirement already satisfied: parso==0.8.3 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (0.8.3)
Collecting py==1.11.0
Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.7/98.7 KB 9.7 MB/s eta 0:00:00
Requirement already satisfied: six==1.16.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.16.0)
Requirement already satisfied: beautifulsoup4==4.11.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (4.11.1)
Collecting h11==0.13.0
Downloading h11-0.13.0-py3-none-any.whl (58 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.2/58.2 KB 6.2 MB/s eta 0:00:00
Requirement already satisfied: pyopenssl==22.0.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (22.0.0)
Collecting cryptography==37.0.1
Downloading cryptography-37.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.0/4.0 MB 22.0 MB/s eta 0:00:00
Requirement already satisfied: sortedcontainers==2.4.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.4.0)
Collecting pytest-xdist==2.5.0
Downloading pytest_xdist-2.5.0-py3-none-any.whl (41 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.7/41.7 KB 3.4 MB/s eta 0:00:00
Requirement already satisfied: urllib3==1.26.9 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (1.26.9)
Requirement already satisfied: pycparser==2.21 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (2.21)
Requirement already satisfied: tornado==6.1 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (6.1)
Collecting pdfminer.six==20220319
Downloading pdfminer.six-20220319-py3-none-any.whl (5.6 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.6/5.6 MB 17.9 MB/s eta 0:00:00
Collecting pyotp==2.6.0
Downloading pyotp-2.6.0-py2.py3-none-any.whl (11 kB)
Requirement already satisfied: Pillow==9.1.0 in /opt/conda/lib/python3.9/site-packages (from seleniumbase) (9.1.0)
Requirement already satisfied: pickleshare in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (0.7.5)
Requirement already satisfied: backcall in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (0.2.0)
Requirement already satisfied: pexpect>4.3 in /opt/conda/lib/python3.9/site-packages (from ipython==7.33.0->seleniumbase) (4.8.0)
Requirement already satisfied: wcwidth in /opt/conda/lib/python3.9/site-packages (from prompt-toolkit==3.0.29->seleniumbase) (0.2.5)
Collecting iniconfig
Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Collecting commonmark<0.10.0,>=0.9.0
Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.1/51.1 KB 5.2 MB/s eta 0:00:00
Requirement already satisfied: async-generator>=1.9 in /opt/conda/lib/python3.9/site-packages (from trio==0.20.0->seleniumbase) (1.10)
Collecting outcome
Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting scrapy>=1.0.0
Downloading Scrapy-2.6.1-py2.py3-none-any.whl (264 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 264.3/264.3 KB 15.3 MB/s eta 0:00:00
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.9/site-packages (from packaging>=21.3->seleniumbase) (3.0.8)
Collecting protego>=0.1.15
Downloading Protego-0.2.1-py2.py3-none-any.whl (8.2 kB)
Requirement already satisfied: parsel>=1.5.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (1.6.0)
Collecting tldextract
Downloading tldextract-3.2.1-py3-none-any.whl (87 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 87.8/87.8 KB 8.3 MB/s eta 0:00:00
Requirement already satisfied: lxml>=3.5.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (4.8.0)
Collecting Twisted>=17.9.0
Downloading Twisted-22.4.0-py3-none-any.whl (3.1 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 22.3 MB/s eta 0:00:00
Collecting service-identity>=16.0.0
Downloading service_identity-21.1.0-py2.py3-none-any.whl (12 kB)
Requirement already satisfied: w3lib>=1.17.0 in /opt/conda/lib/python3.9/site-packages (from scrapy>=1.0.0->scrapy-selenium) (1.22.0)
Collecting itemadapter>=0.1.0
Downloading itemadapter-0.5.0-py3-none-any.whl (10 kB)
Collecting itemloaders>=1.0.1
Downloading itemloaders-1.0.4-py3-none-any.whl (11 kB)
Collecting queuelib>=1.4.2
Downloading queuelib-1.6.2-py2.py3-none-any.whl (13 kB)
Collecting zope.interface>=4.1.3
Downloading zope.interface-5.4.0-cp39-cp39-manylinux2010_x86_64.whl (255 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 255.8/255.8 KB 13.6 MB/s eta 0:00:00
Collecting PyDispatcher>=2.0.5
Downloading PyDispatcher-2.0.5.zip (47 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 47.6/47.6 KB 4.5 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Collecting jmespath>=0.9.5
Downloading jmespath-1.0.0-py3-none-any.whl (23 kB)
Requirement already satisfied: ptyprocess>=0.5 in /opt/conda/lib/python3.9/site-packages (from pexpect>4.3->ipython==7.33.0->seleniumbase) (0.7.0)
Collecting pyasn1-modules
Downloading pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 155.3/155.3 KB 14.0 MB/s eta 0:00:00
Collecting pyasn1
Downloading pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.1/77.1 KB 8.1 MB/s eta 0:00:00
Collecting Automat>=0.8.0
Downloading Automat-20.2.0-py2.py3-none-any.whl (31 kB)
Collecting constantly>=15.1
Downloading constantly-15.1.0-py2.py3-none-any.whl (7.9 kB)
Requirement already satisfied: typing-extensions>=3.6.5 in /opt/conda/lib/python3.9/site-packages (from Twisted>=17.9.0->scrapy>=1.0.0->scrapy-selenium) (4.2.0)
Collecting incremental>=21.3.0
Downloading incremental-21.3.0-py2.py3-none-any.whl (15 kB)
Collecting hyperlink>=17.1.1
Downloading hyperlink-21.0.0-py2.py3-none-any.whl (74 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 74.6/74.6 KB 9.1 MB/s eta 0:00:00
Requirement already satisfied: PySocks!=1.5.7,<2.0,>=1.5.6 in /opt/conda/lib/python3.9/site-packages (from urllib3==1.26.9->seleniumbase) (1.7.1)
Collecting requests-file>=1.4
Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Building wheels for collected packages: ipdb, PyDispatcher
Building wheel for ipdb (setup.py) ... done
Created wheel for ipdb: filename=ipdb-0.13.9-py3-none-any.whl size=11646 sha256=4b32fcc05f5ab0b0fa9bb2fdf21ec3d5ae8465b51bb9d106e916a77dda392b14
Stored in directory: /home/jovyan/.cache/pip/wheels/f7/29/9a/cf774cd86e9802f075a0be1c9e0830bc062d07897b2e9e87cd
Building wheel for PyDispatcher (setup.py) ... done
Created wheel for PyDispatcher: filename=PyDispatcher-2.0.5-py3-none-any.whl size=11516 sha256=2d98fbbdf2c1959b062f50876512ac0499e2ceacddd48fd2cda2600024aeb3d7
Stored in directory: /home/jovyan/.cache/pip/wheels/a5/de/8a/4b52190a95d99c042ec6bd5ad2de3a3c1b5ce71d69f0bbd036
Successfully built ipdb PyDispatcher
Installing collected packages: pyotp, PyDispatcher, pyasn1, parameterized, nose, iniconfig, incremental, constantly, commonmark, boto, zope.interface, toml, soupsieve, sbvirtualdisplay, rich, queuelib, pymysql, pyasn1-modules, py, protego, pluggy, outcome, more-itertools, jmespath, itemadapter, hyperlink, h11, filelock, fasteners, execnet, chardet, Automat, wsproto, Twisted, trio, setuptools-scm, requests-file, pytest, ipython, cryptography, trio-websocket, tldextract, service-identity, pytest-rerunfailures, pytest-ordering, pytest-metadata, pytest-forked, pdfminer.six, itemloaders, ipdb, scrapy, pytest-xdist, pytest-html, selenium, seleniumbase, scrapy-selenium
Attempting uninstall: soupsieve
Found existing installation: soupsieve 2.3.1
Uninstalling soupsieve-2.3.1:
Successfully uninstalled soupsieve-2.3.1
Attempting uninstall: ipython
Found existing installation: ipython 8.3.0
Uninstalling ipython-8.3.0:
Successfully uninstalled ipython-8.3.0
Attempting uninstall: cryptography
Found existing installation: cryptography 36.0.2
Uninstalling cryptography-36.0.2:
Successfully uninstalled cryptography-36.0.2
Successfully installed Automat-20.2.0 PyDispatcher-2.0.5 Twisted-22.4.0 boto-2.49.0 chardet-4.0.0 commonmark-0.9.1 constantly-15.1.0 cryptography-37.0.1 execnet-1.9.0 fasteners-0.17.3 filelock-3.6.0 h11-0.13.0 hyperlink-21.0.0 incremental-21.3.0 iniconfig-1.1.1 ipdb-0.13.9 ipython-7.33.0 itemadapter-0.5.0 itemloaders-1.0.4 jmespath-1.0.0 more-itertools-8.12.0 nose-1.3.7 outcome-1.1.0 parameterized-0.8.1 pdfminer.six-20220319 pluggy-1.0.0 protego-0.2.1 py-1.11.0 pyasn1-0.4.8 pyasn1-modules-0.2.8 pymysql-1.0.2 pyotp-2.6.0 pytest-7.1.2 pytest-forked-1.4.0 pytest-html-2.0.1 pytest-metadata-2.0.1 pytest-ordering-0.6 pytest-rerunfailures-10.2 pytest-xdist-2.5.0 queuelib-1.6.2 requests-file-1.5.1 rich-12.3.0 sbvirtualdisplay-1.1.0 scrapy-2.6.1 scrapy-selenium-0.0.7 selenium-4.1.3 seleniumbase-2.5.4.post1 service-identity-21.1.0 setuptools-scm-6.4.2 soupsieve-2.3.2.post1 tldextract-3.2.1 toml-0.10.2 trio-0.20.0 trio-websocket-0.9.2 wsproto-1.1.0 zope.interface-5.4.0
In [ ]:
# !pip install scrapy-selenium
!python scrapy/learn/awesome_selfhosted/spiders/scrapy_selenium.py
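Running a spider file directly with `python` (rather than `scrapy crawl`) only works if the script bootstraps Scrapy itself. Below is a minimal sketch of that pattern with `CrawlerProcess`; the spider class, name, and CSS selector are illustrative assumptions, not the contents of the actual file.
In [ ]:
# Sketch of a self-running spider file, so `python spiders/awesome_list.py`
# works outside a Scrapy project. Class name and selector are assumptions.
import scrapy
from scrapy.crawler import CrawlerProcess


class AwesomeListSpider(scrapy.Spider):
    name = "awesome_list"
    start_urls = ["https://github.com/awesome-selfhosted/awesome-selfhosted"]

    def parse(self, response):
        # Yield one item per link in the rendered README
        for href in response.css("article a::attr(href)").getall():
            yield {"url": href}


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(AwesomeListSpider)
    process.start()  # blocks until the crawl finishes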
In [ ]:
# Fetch the Scrapy documentation PDF (a bare URL is not a runnable shell command; wget assumed available)
!wget -q https://docs.scrapy.org/_/downloads/en/latest/pdf/ -O scrapy-docs.pdf
In [ ]:
import sys
!{sys.executable} -m pip install scrapy-pyppeteer
In [ ]:
!python scrapy/scrapy-linkedin/scrapy-linkedin/spiders/linkedin_login.py
In [13]:
!python scrapy/github/awesome/spiders/awesome_list.py
2021-10-16 00:29:31 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2021-10-16 00:29:31 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31
2021-10-16 00:29:31 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-10-16 00:29:31 [scrapy.crawler] INFO: Overridden settings:
{}
2021-10-16 00:29:31 [scrapy.extensions.telnet] INFO: Telnet Password: c3ecb039fe45254e
2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2021-10-16 00:29:31 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2021-10-16 00:29:31 [scrapy.core.engine] INFO: Spider opened
2021-10-16 00:29:31 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-10-16 00:29:31 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-10-16 00:29:31 [py.warnings] WARNING: /opt/conda/lib/python3.9/site-packages/scrapy_splash/request.py:42: ScrapyDeprecationWarning: Call to deprecated function to_native_str. Use to_unicode instead.
url = to_native_str(url)
2021-10-16 00:29:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://github.com/awesome-selfhosted/awesome-selfhosted> (referer: None)
2021-10-16 00:29:32 [scrapy.core.engine] INFO: Closing spider (finished)
2021-10-16 00:29:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 251,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 125130,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 1.484227,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 10, 16, 0, 29, 32, 805803),
'httpcompression/response_bytes': 637769,
'httpcompression/response_count': 1,
'log_count/DEBUG': 1,
'log_count/INFO': 10,
'log_count/WARNING': 1,
'memusage/max': 61575168,
'memusage/startup': 61575168,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2021, 10, 16, 0, 29, 31, 321576)}
2021-10-16 00:29:32 [scrapy.core.engine] INFO: Spider closed (finished)
In [1]:
import requests
r = requests.get(
"http://splash:8050/render.json?url=http://domain.com/page-with-javascript.html&timeout=10&wait=0.5"
)
In [2]:
print(r.status_code)  # requests exposes .status_code, not .status
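When the Splash container is reachable, the interesting part is the JSON body rather than the status code. A minimal sketch of unpacking it, assuming the same request as above; per the render.json defaults, `title` is returned out of the box, while `html` only appears if `html=1` is added to the query string.
In [ ]:
# Unpack the render.json payload from Splash.
data = r.json()
print(data.get("title"))  # page title after JavaScript executed
rendered = data.get("html")
print(len(rendered) if rendered else "no 'html' field; request it with html=1")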
In [ ]:
!python scrapy/github/awesome_selfhosted/spiders/quotes.py
Scrapy Integration Testing
In [11]:
!python scrapy/github/awesome/spiders/splash_test.py
2021-10-16 00:25:35 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2021-10-16 00:25:35 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31
2021-10-16 00:25:35 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-10-16 00:25:35 [scrapy.crawler] INFO: Overridden settings:
{}
2021-10-16 00:25:35 [scrapy.extensions.telnet] INFO: Telnet Password: b8a082d3f20f441c
2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2021-10-16 00:25:35 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2021-10-16 00:25:35 [scrapy.core.engine] INFO: Spider opened
2021-10-16 00:25:35 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-10-16 00:25:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-10-16 00:25:36 [py.warnings] WARNING: /opt/conda/lib/python3.9/site-packages/scrapy_splash/request.py:42: ScrapyDeprecationWarning: Call to deprecated function to_native_str. Use to_unicode instead.
url = to_native_str(url)
2021-10-16 00:25:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://portfolio.donavanaldrich.com> (referer: None)
2021-10-16 00:25:36 [scrapy.core.scraper] ERROR: Spider error processing <GET https://portfolio.donavanaldrich.com> (referer: None)
Traceback (most recent call last):
File "/opt/conda/lib/python3.9/site-packages/twisted/internet/defer.py", line 858, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "/home/jovyan/code/scrapy/github/awesome/spiders/splash_test.py", line 29, in parse
png_bytes = base64.b64decode(response.data['png'])
AttributeError: 'HtmlResponse' object has no attribute 'data'
2021-10-16 00:25:36 [scrapy.core.engine] INFO: Closing spider (finished)
2021-10-16 00:25:36 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 232,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 30168,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.689888,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 10, 16, 0, 25, 36, 688102),
'httpcompression/response_bytes': 208226,
'httpcompression/response_count': 1,
'log_count/DEBUG': 1,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'log_count/WARNING': 1,
'memusage/max': 59232256,
'memusage/startup': 59232256,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/AttributeError': 1,
'start_time': datetime.datetime(2021, 10, 16, 0, 25, 35, 998214)}
2021-10-16 00:25:36 [scrapy.core.engine] INFO: Spider closed (finished)
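The `AttributeError: 'HtmlResponse' object has no attribute 'data'` means the request never went through Splash: only scrapy-splash's `SplashJsonResponse` carries a `.data` attribute. A minimal sketch of routing the request through Splash so `response.data['png']` exists; the Splash URL matches the service used earlier in this notebook, and the middleware priorities follow the scrapy-splash README.
In [ ]:
# Sketch: fetch a page through Splash and save the rendered screenshot.
# Requires the Splash service (here http://splash:8050) to be running.
import base64

import scrapy
from scrapy_splash import SplashRequest


class SplashPngSpider(scrapy.Spider):
    name = "splash_png"
    custom_settings = {
        "SPLASH_URL": "http://splash:8050",
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy_splash.SplashCookiesMiddleware": 723,
            "scrapy_splash.SplashMiddleware": 725,
        },
        "DUPEFILTER_CLASS": "scrapy_splash.SplashAwareDupeFilter",
    }

    def start_requests(self):
        yield SplashRequest(
            "https://portfolio.donavanaldrich.com",
            callback=self.parse,
            endpoint="render.json",
            args={"png": 1, "wait": 0.5},
        )

    def parse(self, response):
        # SplashJsonResponse exposes the decoded JSON body as response.data
        png_bytes = base64.b64decode(response.data["png"])
        with open("rendered.png", "wb") as f:
            f.write(png_bytes)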
In [12]:
!python scrapy/github/awesome/spiders/selenium_test.py
2021-10-16 00:28:09 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2021-10-16 00:28:09 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.7 | packaged by conda-forge | (default, Sep 23 2021, 07:28:37) - [GCC 9.4.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l 24 Aug 2021), cryptography 35.0.0, Platform Linux-5.4.0-88-generic-x86_64-with-glibc2.31
2021-10-16 00:28:09 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-10-16 00:28:09 [scrapy.crawler] INFO: Overridden settings:
{}
2021-10-16 00:28:09 [scrapy.extensions.telnet] INFO: Telnet Password: 5715398da4f12340
2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2021-10-16 00:28:09 [scrapy.middleware] WARNING: Disabled SeleniumMiddleware: SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set
2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2021-10-16 00:28:09 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2021-10-16 00:28:09 [scrapy.core.engine] INFO: Spider opened
2021-10-16 00:28:09 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-10-16 00:28:10 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-10-16 00:28:10 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.google.com/> from <GET https://google.com>
2021-10-16 00:28:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.google.com/> (referer: None)
2021-10-16 00:28:10 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.google.com/> (referer: None)
Traceback (most recent call last):
File "/opt/conda/lib/python3.9/site-packages/twisted/internet/defer.py", line 858, in _runCallbacks
current.result = callback( # type: ignore[misc]
File "/home/jovyan/code/scrapy/github/awesome/spiders/selenium_test.py", line 29, in parse_result
print(response.request.meta['driver'].title)
KeyError: 'driver'
2021-10-16 00:28:10 [scrapy.core.engine] INFO: Closing spider (finished)
2021-10-16 00:28:10 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 432,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 7820,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'elapsed_time_seconds': 0.602981,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 10, 16, 0, 28, 10, 602166),
'httpcompression/response_bytes': 14104,
'httpcompression/response_count': 1,
'log_count/DEBUG': 2,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'log_count/WARNING': 1,
'memusage/max': 56184832,
'memusage/startup': 56184832,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'spider_exceptions/KeyError': 1,
'start_time': datetime.datetime(2021, 10, 16, 0, 28, 9, 999185)}
2021-10-16 00:28:10 [scrapy.core.engine] INFO: Spider closed (finished)
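Here the `Disabled SeleniumMiddleware` warning at startup and the `KeyError: 'driver'` in the callback share one cause: scrapy-selenium was never configured, so plain requests went out and nothing populated `meta['driver']`. A minimal sketch of the required settings and a `SeleniumRequest` follows; the chromedriver path is an assumption to adjust for the environment.
In [ ]:
# Sketch: configure scrapy-selenium so meta['driver'] is set in callbacks.
# SELENIUM_DRIVER_EXECUTABLE_PATH below is an assumption.
import scrapy
from scrapy_selenium import SeleniumRequest


class SeleniumTitleSpider(scrapy.Spider):
    name = "selenium_title"
    custom_settings = {
        "SELENIUM_DRIVER_NAME": "chrome",
        "SELENIUM_DRIVER_EXECUTABLE_PATH": "/usr/local/bin/chromedriver",
        "SELENIUM_DRIVER_ARGUMENTS": ["--headless"],
        "DOWNLOADER_MIDDLEWARES": {"scrapy_selenium.SeleniumMiddleware": 800},
    }

    def start_requests(self):
        yield SeleniumRequest(url="https://google.com", callback=self.parse_result)

    def parse_result(self, response):
        # SeleniumMiddleware attaches the live webdriver to request.meta
        print(response.request.meta["driver"].title)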
In [ ]:
from utils.browser.test import *
AnonymousBrowserTest()
!python utils/browser/test.py
Testing
In [ ]:
from utils.browser.network import *
chrome_options = webdriver.ChromeOptions()
# capabilities = browser_capabilities(module)
print("Webdriver Initializing")
driver = webdriver.Remote(
command_executor="http://192.168.1.101:4444/wd/hub",
options=chrome_options,
# desired_capabilities=capabilities,
)
log.info("Proxy IP info,,,")
proxy_ip("data/proxy_ip.json")
log.info("VPN Settings...")
vpn_settings("data/vpn.json")
log.info("Selenoid Status...")
selenoid_status("data/selenoid_status.json")
log.info("Real IP...")
ip_status(driver)
log.info("Browser Settings...")
browser_config(driver)
log.info("Random User Agent...")
# user_agent(driver)
# driver_object(driver)
# browser_feature(driver)
# uc_test()
driver.quit()
log.info("Session Terminated!")
log.info("test Finished")
# return
In [10]:
import os

from linkedin_scraper import Person, actions
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Remote(
command_executor="http://192.168.1.101:3148", options=chrome_options
)
email = "aldrich.don@gmail.com"
password = os.getenv(password)
actions.login(
driver, email, password
) # if email and password isnt given, it'll prompt in terminal
person = Person("https://www.linkedin.com/in/don-aldrich-48a51815", driver=driver)
print(person)
# driver.get("https://www.linkedin.com/in/don-aldrich-48a51815")
# driver.save_screenshot('./linkedin.png')
# driver.quit()
# driver = webdriver.Chrome()
---------------------------------------------------------------------------
WebDriverException                        Traceback (most recent call last)
/tmp/ipykernel_111/2032127904.py in <module>
      3
      4 chrome_options = webdriver.ChromeOptions()
----> 5 driver = webdriver.Remote(
      6     command_executor="http://192.168.1.101:3148", options=chrome_options
      7 )

/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py in __init__(self, command_executor, desired_capabilities, browser_profile, proxy, keep_alive, file_detector, options)
    155                 warnings.warn("Please use FirefoxOptions to set browser profile",
    156                               DeprecationWarning, stacklevel=2)
--> 157         self.start_session(capabilities, browser_profile)
    158         self._switch_to = SwitchTo(self)
    159         self._mobile = Mobile(self)

/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py in start_session(self, capabilities, browser_profile)
    250         parameters = {"capabilities": w3c_caps,
    251                       "desiredCapabilities": capabilities}
--> 252         response = self.execute(Command.NEW_SESSION, parameters)
    253         if 'sessionId' not in response:
    254             response = response['value']

/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
    319             response = self.command_executor.execute(driver_command, params)
    320             if response:
--> 321                 self.error_handler.check_response(response)
    322             response['value'] = self._unwrap_value(
    323                 response.get('value', None))

/opt/conda/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
    206         if exception_class == ErrorInResponseException:
    207             raise exception_class(response, value)
--> 208             raise exception_class(value)
    209         if message == "" and 'message' in value:
    210             message = value['message']

WebDriverException: Message: <!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Error</title>
</head>
<body>
<pre>Cannot POST /session</pre>
</body>
</html>
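`Cannot POST /session` means port 3148 answers HTTP but does not serve the WebDriver wire protocol at that path; Selenium Grid and Selenoid style endpoints usually live under `/wd/hub`, as in the Remote call that worked earlier. A sketch of the corrected connection, reusing that hub URL; whether port 3148 ever serves WebDriver is an open question.
In [ ]:
# Retry against the /wd/hub endpoint used successfully earlier in this notebook.
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
driver = webdriver.Remote(
    command_executor="http://192.168.1.101:4444/wd/hub",
    options=chrome_options,
)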
In [ ]:
import sys
!{sys.executable} -m pip install linkedin_scraper
In [3]:
import ipywidgets as widgets

out = widgets.FloatProgress(
    value=7.5,
    min=0,
    max=10.0,
    step=0.1,
    description="Loading:",
    bar_style="info",
    orientation="horizontal",
)
out
In [4]:
out = widgets.Output(layout={"border": "1px solid black"})
out
In [ ]: