diff --git a/docker-compose.yaml.example b/docker-compose.yaml.example index ac07757..0fce497 100644 --- a/docker-compose.yaml.example +++ b/docker-compose.yaml.example @@ -10,11 +10,25 @@ services: ports: - 127.0.0.1:8080:80 depends_on: + - instagram-api - quotes-api - twitch-cache-api - twitch-subs-api - cms + # Instagram API service with /data/instagram mounted as database storage + # INSTAGRAM_USERNAME is needed for synchronization + instagram-api: + build: + context: ./instagram-api + volumes: + - /data/instagram:/instagram + environment: + - SQLALCHEMY_DATABASE_URI=sqlite:////instagram/instagram.db + - INSTAGRAM_USERNAME=__INSTAGRAM_USERNAME__ + expose: + - 5000 + # Quotes API service with /data/quotes mounted as database storage # SECRET_KEY is needed for API key validation quotes-api: diff --git a/instagram-api/.gitignore b/instagram-api/.gitignore new file mode 100644 index 0000000..6a18ad4 --- /dev/null +++ b/instagram-api/.gitignore @@ -0,0 +1,96 @@ +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + diff --git a/instagram-api/Dockerfile b/instagram-api/Dockerfile new file mode 100644 index 0000000..3e9833f --- /dev/null +++ b/instagram-api/Dockerfile @@ -0,0 +1,14 @@ +FROM python:alpine + +WORKDIR /app +COPY . . + +RUN pip install --no-cache-dir --requirement requirements.txt + +RUN addgroup -g 9999 lilia + +EXPOSE 5000 + +USER nobody:lilia + +ENTRYPOINT ["python", "app.py"] diff --git a/instagram-api/app.py b/instagram-api/app.py new file mode 100644 index 0000000..391d895 --- /dev/null +++ b/instagram-api/app.py @@ -0,0 +1,116 @@ +import logging +import os + +import flask +import flask_apscheduler +import flask_restful +import flask_restful.fields +import flask_restful.reqparse +import sqlalchemy +import sqlalchemy.engine + +from db import db, Medium + + +app = flask.Flask(__name__) +app.logger.setLevel(logging.INFO) +app.config.update( + ERROR_404_HELP=False, + SQLALCHEMY_TRACK_MODIFICATIONS=False, + SQLALCHEMY_DATABASE_URI=os.getenv('SQLALCHEMY_DATABASE_URI'), + SCHEDULER_TIMEZONE='UTC', + SCHEDULER_JOBS=[ + dict(id='sync_media', + func='sync:Sync.sync_media', + args=(app, db), + max_instances=1, + trigger='interval', + seconds=300)]) + +if app.config.get('SQLALCHEMY_DATABASE_URI', '').startswith('sqlite://'): + 
class MediaResource(flask_restful.Resource):
    """Collection endpoint returning stored media.

    Query-string arguments (all optional, parsed by ``filter_parser``):

    * ``filter``      -- case-insensitive substring match on the caption.
    * ``type``        -- exact match on ``typename``.
    * ``sort_by``     -- name of one of the marshalled medium fields.
    * ``sort_order``  -- ``asc``, ``desc`` or ``random``.
    * ``page_size``   -- maximum number of rows to return.
    * ``page_number`` -- zero-based page index; only used with ``page_size``.

    The number of rows matching the filters (before paging) is reported in
    the ``X-Total-Count`` response header.
    """

    # Direction methods that may be invoked on a column.  The value comes
    # straight from the query string, so it must never be used for an
    # unrestricted getattr()/call on the column object.
    SORT_ORDERS = ('asc', 'desc')

    @flask_restful.marshal_with(medium_fields)
    def get(self):
        args = filter_parser.parse_args()
        q = db.session.query(Medium)
        if args['filter']:
            q = q.filter(Medium.caption.ilike('%{}%'.format(args['filter'])))
        if args['type']:
            q = q.filter(Medium.typename == args['type'])

        # Count before ordering/paging so the header reflects every match.
        count = q.count()

        sort_by = args['sort_by']
        sort_order = args['sort_order']
        if sort_order == 'random':
            q = q.order_by(sqlalchemy.func.random())
        elif sort_by in medium_fields:
            # Only names that are exposed through the API are accepted;
            # anything else in ``sort_by`` is ignored instead of being
            # resolved as an arbitrary attribute of the model class.
            col = getattr(Medium, sort_by, None)
            if col is not None:
                if not sort_order:
                    q = q.order_by(col)
                elif sort_order in self.SORT_ORDERS:
                    q = q.order_by(getattr(col, sort_order)())
                # An unrecognised sort_order leaves the query unordered,
                # as it did before.

        page_size = args['page_size']
        page_number = args['page_number']
        # Non-positive values are treated as "not supplied" rather than being
        # handed to LIMIT/OFFSET.
        if page_size and page_size > 0:
            q = q.limit(page_size)
            if page_number and page_number > 0:
                q = q.offset(page_number * page_size)

        media = q.all()
        return media, 200, {'X-Total-Count': str(count)}
class Instagram(object):
    """Reads the media timeline of a single Instagram profile.

    On construction the profile page ``BASE_URL/<username>`` is fetched and
    the JSON assigned to ``window._sharedData`` is parsed.  From it the
    instance keeps:

    * ``user``    -- the profile's ``graphql.user`` mapping without its
                     ``edge*`` entries (``{}`` when the page could not be read);
    * ``rhx_gis`` -- the ``rhx_gis`` value of the shared data, used to sign
                     GraphQL requests (``None`` when unavailable).
    """

    # Number of timeline entries requested per GraphQL page.
    PAGE_SIZE = 50

    def __init__(self, username):
        self.username = username
        shared_data = self._get_shared_data()
        try:
            graphql = shared_data['entry_data']['ProfilePage'][0]['graphql']
            self.user = {k: v for k, v in graphql['user'].items()
                         if not k.startswith('edge')}
            self.rhx_gis = shared_data['rhx_gis']
        except (AttributeError, IndexError, KeyError, TypeError):
            # Missing page, unexpected layout or shared_data being None.
            self.user = {}
            self.rhx_gis = None

    @staticmethod
    def _sign(rhx_gis, variables):
        """Return the ``X-Instagram-GIS`` header value for a query.

        The value is the hexadecimal MD5 digest of ``'<rhx_gis>:<variables>'``
        where ``variables`` is the JSON-encoded query variables string.
        """
        payload = '{0}:{1}'.format(rhx_gis, variables)
        return hashlib.md5(payload.encode('UTF-8')).hexdigest()

    def _get_shared_data(self):
        """Fetch the profile page and return its parsed shared data.

        Returns ``None`` when the request is not successful, the page does not
        contain the ``window._sharedData`` assignment, or the captured text is
        not valid JSON.
        """
        # The session is closed once the single request has completed.
        with FuturesSession() as session:
            r = session.get('{0}/{1}'.format(BASE_URL, self.username)).result()
        if not r.ok:
            return None
        m = SHARED_DATA.search(r.text)
        if not m:
            return None
        try:
            return json.loads(m.group(1))
        except ValueError:
            return None

    def fetch_media(self):
        """Return all timeline media nodes of the profile as a list of dicts.

        Pages of ``PAGE_SIZE`` entries are requested from the GraphQL endpoint
        until no further cursor is reported.  Every node's ``owner`` mapping
        is extended with the profile data held in ``self.user``.

        An empty list is returned when the profile id is unknown, or as soon
        as any page request fails or has an unexpected shape (results fetched
        so far are discarded in that case, as before).
        """
        user_id = self.user.get('id')
        if not user_id:
            # The profile lookup failed; a query without an id cannot succeed.
            return []

        url = '{0}/graphql/query'.format(BASE_URL)
        result = []
        cursor = ''
        with FuturesSession() as session:
            while True:
                variables = json.dumps(dict(
                    id=user_id,
                    first=self.PAGE_SIZE,
                    after=cursor))
                params = dict(
                    query_hash=QUERY_HASH,
                    variables=variables)
                headers = {'X-Instagram-GIS': self._sign(self.rhx_gis, variables)}
                r = session.get(url, params=params, headers=headers).result()
                if not r.ok:
                    return []
                try:
                    data = r.json()['data']['user']['edge_owner_to_timeline_media']
                except (KeyError, TypeError, ValueError):
                    return []
                if not isinstance(data, dict):
                    return []
                for edge in data.get('edges', []):
                    node = edge.get('node', {})
                    # A node without an owner entry gets one instead of
                    # aborting the whole fetch with a KeyError.
                    node.setdefault('owner', {}).update(self.user)
                    result.append(node)
                page_info = data.get('page_info') or {}
                cursor = page_info.get('end_cursor')
                # NOTE(review): has_next_page is assumed to be part of
                # page_info; when it is absent only the cursor decides.
                if not cursor or not page_info.get('has_next_page', True):
                    break
        return result
class Sync(object):
    """Scheduled job that copies a profile's Instagram media into the database."""

    @staticmethod
    def _get(d, *keys, default=None):
        """Walk ``d`` along ``keys`` and return the value found there.

        Each key is applied to the value reached so far: dict keys for dicts,
        integer indexes for lists.  ``default`` is returned as soon as a step
        cannot be taken (missing key, index out of range, or the current value
        is neither a dict nor a list).  The lookup never falls back to the
        root mapping, so an empty or ``None`` intermediate value cannot yield
        an unrelated top-level entry.

        Called without any key the result is ``None``.
        """
        if not keys:
            return None
        current = d
        for key in keys:
            if not isinstance(current, (dict, list)):
                return default
            try:
                current = current[key]
            except (KeyError, IndexError, TypeError):
                return default
        return current

    @staticmethod
    def _to_datetime(val):
        """Convert a POSIX timestamp to a naive UTC ``datetime``.

        Falsy input (``None``, ``0``, ``''``) is treated as "no timestamp" and
        yields ``None``.
        """
        if not val:
            return None
        # Same value utcfromtimestamp() produced, without the deprecated call.
        aware = datetime.datetime.fromtimestamp(val, datetime.timezone.utc)
        return aware.replace(tzinfo=None)

    @classmethod
    def sync_media(cls, app, db):
        """Fetch the media of ``INSTAGRAM_USERNAME`` and upsert them.

        For every fetched node with an ``id`` the matching ``Medium`` row is
        loaded (or created) and its columns are overwritten from the node.
        All changes are committed together after the last node.

        :param app: Flask application providing the logger and app context.
        :param db: Flask-SQLAlchemy handle whose session is used.
        """
        app.logger.info('Starting synchronization of media')
        username = os.getenv('INSTAGRAM_USERNAME')
        if not username:
            # Without a configured profile there is nothing to synchronize.
            app.logger.warning('INSTAGRAM_USERNAME is not set, skipping synchronization')
            return
        with app.app_context():
            instagram = Instagram(username)
            for med in instagram.fetch_media():
                media_id = cls._get(med, 'id')
                if not media_id:
                    continue
                q = db.session.query(Medium).filter(Medium.id == media_id)
                medium = q.first()
                if not medium:
                    medium = Medium(id=media_id)
                medium.typename = cls._get(med, '__typename')
                medium.caption = cls._get(
                    med, 'edge_media_to_caption', 'edges', 0, 'node', 'text')
                medium.shortcode = cls._get(med, 'shortcode')
                medium.taken_at = cls._to_datetime(
                    cls._get(med, 'taken_at_timestamp'))
                medium.width = cls._get(med, 'dimensions', 'width')
                medium.height = cls._get(med, 'dimensions', 'height')
                medium.display_url = cls._get(med, 'display_url')
                medium.thumbnail_url = cls._get(med, 'thumbnail_src')
                medium.likes = cls._get(med, 'edge_media_preview_like', 'count')
                medium.owner_id = cls._get(med, 'owner', 'id')
                medium.owner_username = cls._get(med, 'owner', 'username')
                medium.owner_profile_pic_url = cls._get(
                    med, 'owner', 'profile_pic_url')
                db.session.add(medium)
            db.session.commit()
        app.logger.info('Synchronization of media completed')
/twitch-logs; } + location ^~ /instagram/api/ { + rewrite ^/instagram/api(/.*)$ $1 break; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_http_version 1.1; + tcp_nodelay on; + proxy_pass http://instagram-api:5000/; + } + location ^~ /quotes/api/ { rewrite ^/quotes/api(/.*)$ $1 break; proxy_set_header Host $host;