Ctrl →

sasholy · Авг. 21, 2018 17:10:28

Здравствуйте. Кто может отредактировать код, чтобы можно было сохранять прокси в текстовый файл для дальнейшей работы? Нужны https ip. Нужны ли здесь какие-либо проверки типа if, else, или, может быть, код можно как-то улучшить? Благодарю.

 import requests
from bs4 import BeautifulSoup
from random import choice
def get_proxy():
    """Pick a random proxy from the top of the free-proxy-list.net table.

    Returns:
        dict: ``{'schema': 'http' or 'https', 'address': 'ip:port'}``.

    Raises:
        requests.RequestException: the listing page could not be fetched
            (network error, timeout or non-2xx status).
        RuntimeError: the page has no proxy table or no usable rows.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get('https://free-proxy-list.net/', timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', id='proxylisttable')
    if table is None:
        raise RuntimeError('proxy table not found on free-proxy-list.net')
    proxies = []
    # Row 0 is skipped (presumably the header row); at most ten rows are read.
    for tr in table.find_all('tr')[1:11]:
        tds = tr.find_all('td')
        if len(tds) < 7:
            # Malformed or short row: cells 0, 1 and 6 are all required.
            continue
        ip = tds[0].text.strip()
        port = tds[1].text.strip()
        # Cell 6 is assumed to be the site's "Https" yes/no column.
        schema = 'https' if 'yes' in tds[6].text.strip() else 'http'
        proxies.append({'schema': schema, 'address': ip + ':' + port})
    if not proxies:
        raise RuntimeError('no proxies found on free-proxy-list.net')
    return choice(proxies)
def get_html(url):
    """Request *url* through a random proxy and return the reported origin.

    The response body is expected to be JSON containing an ``'origin'`` key.

    Args:
        url: address of the service to query.

    Returns:
        The value stored under ``'origin'`` in the JSON response.
    """
    p = get_proxy()  # {'schema': 'http' or 'https', 'address': 'ip:port'}
    proxy_url = 'http://' + p['address']
    # requests picks a proxy by the scheme of the *target* URL, so keying the
    # mapping only by the proxy's own schema silently bypasses an "https"
    # proxy for http:// targets. Always cover http, and add https only when
    # the proxy is flagged as https-capable.
    proxy = {'http': proxy_url}
    if p['schema'] == 'https':
        proxy['https'] = proxy_url
    r = requests.get(url, proxies=proxy, timeout=5)
    return r.json()['origin']
def main():
    """Print the origin IP reported for a request sent through a proxy."""
    # get_html() decodes the response as JSON and reads its 'origin' key,
    # which is the shape httpbin's /ip endpoint returns. The previous
    # checkip.dyndns.org URL serves an HTML page, so JSON decoding failed.
    url = 'http://httpbin.org/ip'
    print(get_html(url))


if __name__ == '__main__':
    main()

Отредактировано sasholy (Авг. 21, 2018 17:13:53)

sosok43k · Авг. 21, 2018 22:27:04

Могу помочь, обращайтесь.

Vigi · Авг. 22, 2018 08:59:44

 import requests
from bs4 import BeautifulSoup as bs
# Download the proxy listing and write the text of every <td> inside the
# 'table-responsive' container to out.txt, one cell per line, with an empty
# line after each group of eight cells.
html = requests.get('https://free-proxy-list.net/').text
soup = bs(html, 'lxml')
container = soup.find('div', {'class': 'table-responsive'})
cells = container.find_all('td')
with open('out.txt', 'w', encoding='utf-8') as f:
    for position, cell in enumerate(cells, start=1):
        print(cell.text, file=f)
        if position % 8 == 0:
            # Eight cells written: separate them from the next group.
            print(file=f)

sasholy · Авг. 22, 2018 10:28:02

Благодарю. Не могли бы Вы допилить, чтобы выбирало только https, можно было указать, сколько надо проксей (20, 50, 150), и записывало в формате ip:port?

  import requests
from bs4 import BeautifulSoup as bs
# Fetch the proxy listing and dump the text of each table cell to out.txt.
# Cells are written one per line; every complete group of eight cells is
# followed by an empty line.
html = requests.get('https://free-proxy-list.net/').text
soup = bs(html, 'lxml')
res = soup.find('div', {'class': 'table-responsive'}).find_all('td')
with open('out.txt', 'w', encoding='utf-8') as f:
    for start in range(0, len(res), 8):
        group = res[start:start + 8]
        for cell in group:
            print(cell.text, file=f)
        if len(group) == 8:
            # Only a full group of eight gets the trailing separator line.
            print(file=f)

Отредактировано sasholy (Авг. 22, 2018 10:32:43)

Vigi · Авг. 22, 2018 13:02:00

 import requests
from bs4 import BeautifulSoup as bs
# Ask how many table rows to take, then write "ip:port" for each of the
# first n rows of the listing's <tbody> to out.txt.
n = int(input(':>'))
html = requests.get('https://free-proxy-list.net/').text
rows = bs(html, 'lxml').find('tbody').find_all('tr')
with open('out.txt', 'w', encoding='utf-8') as f:
    for row in rows[:n]:
        cells = row.find_all('td')
        # Cell 0 is used as the IP and cell 1 as the port.
        ip, port = cells[0].text, cells[1].text
        print(ip, port, sep=':', file=f)

?

Vigi · Авг. 22, 2018 13:49:22

Сорри, про https забыл, вот:

 import requests
from bs4 import BeautifulSoup as bs
# Ask how many proxies are wanted, then write up to n https-capable proxies
# to out.txt in "ip:port" form, one per line.
n = int(input(':>'))
html = requests.get('https://free-proxy-list.net/').text
soup = bs(html, 'lxml')
res = soup.find('tbody').find_all('tr')
with open('out.txt', 'w', encoding='utf-8') as f:
    saved = 0
    for i in res:
        # Count only rows that were actually written: slicing to the first n
        # rows before filtering yields fewer than n https proxies.
        if saved >= n:
            break
        tds = i.find_all('td')
        # Cell 6 is assumed to be the "Https" yes/no column; strip() guards
        # against stray whitespace around the value.
        if tds[6].text.strip() != 'yes':
            continue
        print(tds[0].text, tds[1].text, sep=':', file=f)
        saved += 1

sasholy · Авг. 22, 2018 14:08:04

Vigi

Куда вписывать число, которое мне надо (50–100 прокси ip)?
Не шарю в этом, учусь.

Отредактировано sasholy (Авг. 22, 2018 14:09:33)

Vigi · Авг. 23, 2018 08:46:31

Тут:

Отредактировано Vigi (Авг. 23, 2018 08:49:44)

sasholy · Авг. 29, 2018 13:48:17

Vigi
?

Как сделать, чтобы этот код работал через мой прокси-лист и юзер-агент? Если возможно, улучшить код. Куда и какой код вставлять с моими txt-списками? И как проверять и отбрасывать нерабочие прокси, чтобы работало без пропусков запросов (сразу подменяло рабочим прокси)?

 import requests
from bs4 import BeautifulSoup
from random import choice, uniform
from time import sleep
import csv
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. requests
            waits indefinitely when no timeout is given, which would hang
            the whole scrape on one stalled connection.

    Returns:
        str: the decoded response body.

    Raises:
        requests.RequestException: on a network failure or timeout.
    """
    r = requests.get(url, timeout=timeout)
    return r.text
def write_csv(data):
    """Append one ``name, price`` row to cmc.csv.

    Args:
        data: mapping with the keys ``'name'`` and ``'price'``.
    """
    # The csv module requires newline='' so it controls line endings itself;
    # otherwise platforms that translate '\n' get blank lines between rows.
    # An explicit encoding keeps the file independent of the OS locale.
    with open('cmc.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([data['name'],
                         data['price']])
def get_page_data(html):
    """Extract name and price from each currency row and save them.

    Parses *html*, walks the rows of the ``<tbody>`` in the table with
    id ``currencies`` and passes one ``{'name', 'price'}`` dict per row to
    ``write_csv``.

    Args:
        html: markup of a listing page.
    """
    document = BeautifulSoup(html, 'lxml')
    table_body = document.find('table', id='currencies').find('tbody')
    for row in table_body.find_all('tr'):
        cells = row.find_all('td')
        # Cell 1 holds the link with the currency name.
        name = cells[1].find('a', class_='currency-name-container').text
        # Cell 3 holds a link whose 'data-usd' attribute is read as the price.
        price = cells[3].find('a').get('data-usd')
        write_csv({'name': name, 'price': price})
def main():
    """Fetch pages 0 through 3 of the listing and store their rows."""
    page_template = 'https://coinmarketcap.com/{}'
    for page_number in range(4):
        page_html = get_html(page_template.format(page_number))
        get_page_data(page_html)


if __name__ == '__main__':
    main()

Отредактировано sasholy (Авг. 29, 2018 16:04:56)

sasholy · Сен. 6, 2018 16:40:20

Что нужно сделать? Не работает — сайт блокирует?

 import requests
from bs4 import BeautifulSoup
from random import choice, uniform
from time import sleep
import csv
def get_html(url, useragent=None, proxy=None, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: address of the page to fetch.
        useragent: optional dict of request headers,
            e.g. ``{'User-Agent': '...'}``.
        proxy: optional requests-style proxies mapping, keyed by the scheme
            of the target URL.
        timeout: seconds to wait before giving up. Without it a dead proxy
            makes the request hang forever instead of failing, so the caller
            never gets the chance to try another proxy.

    Returns:
        str: the decoded response body.

    Raises:
        requests.RequestException: on a network/proxy failure or timeout.
    """
    r = requests.get(url, headers=useragent, proxies=proxy, timeout=timeout)
    return r.text
def write_csv(data):
    """Append a single row (name, price) to cmc.csv.

    Args:
        data: mapping that provides ``'name'`` and ``'price'``.
    """
    # newline='' lets the csv writer emit its own line terminator unchanged
    # (no doubled line breaks on platforms that translate '\n'); the explicit
    # encoding avoids depending on the system default.
    with open('cmc.csv', 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([data['name'], data['price']])
def get_page_data(html):
    """Parse the currency table in *html* and append each row to the CSV.

    For every row of the ``<tbody>`` in the table with id ``currencies`` a
    ``{'name', 'price'}`` dict is built and handed to ``write_csv``.

    Args:
        html: markup of the listing page.
    """
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find('table', id='currencies').find('tbody').find_all('tr')
    for tr in trs:
        tds = tr.find_all('td')
        name = tds[1].find('a', class_='currency-name-container').text
        # Read the price from the link's 'data-usd' attribute. The link is
        # presumably marked with class "price" but has no attribute of that
        # name, so .get('price') always produced None in the CSV.
        # NOTE(review): confirm against the current page markup.
        price = tds[3].find('a').get('data-usd')
        data = {'name': name,
                'price': price}
        write_csv(data)
def main():
    """Scrape the listing page three times through random proxies.

    Reads candidate user agents from useragents.txt and proxies
    (``ip:port`` per line) from proxies.txt. An attempt whose request fails
    is skipped and the next attempt picks a new random proxy.

    Raises:
        ValueError: one of the two input files has no usable lines.
    """
    url = 'https://coinmarketcap.com/'
    # Close the files deterministically and drop blank lines (e.g. the one
    # after a trailing newline) so an empty string is never picked.
    with open('useragents.txt') as f:
        useragents = [line.strip() for line in f if line.strip()]
    with open('proxies.txt') as f:
        proxies = [line.strip() for line in f if line.strip()]
    if not useragents or not proxies:
        raise ValueError('useragents.txt and proxies.txt must not be empty')

    # NOTE(review): the same URL is scraped on every iteration, so each
    # successful attempt appends the rows again; kept as in the original.
    for _ in range(3):
        sleep(uniform(3, 6))
        proxy_url = 'http://' + choice(proxies)
        # requests selects the proxy by the scheme of the target URL; the
        # target is https, so an 'http'-only mapping was never applied.
        proxy = {'http': proxy_url, 'https': proxy_url}
        useragent = {'User-Agent': choice(useragents)}
        try:
            html = get_html(url, useragent, proxy)
        except requests.RequestException:
            # Dead or refusing proxy: move on to the next random one.
            continue
        # Parse the page fetched through the proxy instead of downloading
        # it a second time directly, without proxy and user agent.
        get_page_data(html)


if __name__ == '__main__':
    main()

Отредактировано sasholy (Сен. 11, 2018 14:51:03)

Python-сообщество

Уведомления

#1 Авг. 21, 2018 17:10:28

Спарсить прокси в тхт

#2 Авг. 21, 2018 22:27:04

Спарсить прокси в тхт

#3 Авг. 22, 2018 08:59:44

Спарсить прокси в тхт

#4 Авг. 22, 2018 10:28:02

Спарсить прокси в тхт

#5 Авг. 22, 2018 13:02:00

Спарсить прокси в тхт

#6 Авг. 22, 2018 13:49:22

Спарсить прокси в тхт

#7 Авг. 22, 2018 14:08:04

Спарсить прокси в тхт

#8 Авг. 23, 2018 08:46:31

Спарсить прокси в тхт

#9 Авг. 29, 2018 13:48:17

Спарсить прокси в тхт

#10 Сен. 6, 2018 16:40:20

Спарсить прокси в тхт

Board footer