This project uses a dataset from Kaggle that contains all the metadata for the TV shows and movies on Netflix. The goal is to simulate real-time streaming of movie details using Kafka. The technologies used are Python, Amazon EC2, Apache Kafka, AWS Glue, Amazon Athena, and SQL.


# --- Terminal 1: install Kafka and Java, then start ZooKeeper ---------------
# Download the Kafka 3.7.0 (Scala 2.13) release. The archive host is used
# because downloads.apache.org only carries the currently supported releases,
# while archive.apache.org keeps every past release.
wget https://archive.apache.org/dist/kafka/3.7.0/kafka_2.13-3.7.0.tgz
tar -xvf kafka_2.13-3.7.0.tgz
# Kafka runs on the JVM: install Java and confirm it is on the PATH.
sudo yum install java-1.8.0
java -version
# Every config/... and bin/... path below is relative to the extracted Kafka
# directory, so enter it before using them.
cd kafka_2.13-3.7.0
# Edit the broker configuration.
# NOTE(review): presumably advertised.listeners has to be set to the EC2 public
# IP so that the clients using --bootstrap-server can connect; confirm.
sudo nano config/server.properties
# ZooKeeper must be up before the broker. This command stays in the foreground,
# so the remaining steps need their own terminal sessions.
bin/zookeeper-server-start.sh config/zookeeper.properties

# --- Terminal 2: start the Kafka broker --------------------------------------
# Limit the broker JVM heap to 256 MB maximum / 128 MB initial
# (presumably to fit a small EC2 instance).
export KAFKA_HEAP_OPTS="-Xmx256M -Xms128M"
cd kafka_2.13-3.7.0
# Also runs in the foreground.
bin/kafka-server-start.sh config/server.properties

# --- Another terminal, from inside the kafka_2.13-3.7.0 directory ------------
# Create the netflix_data topic with a single partition and a single replica.
# Replace the {...} placeholder with <EC2 public IP>:9092.
bin/kafka-topics.sh --create --topic netflix_data --bootstrap-server {Put the Public IP of your EC2 Instance:9092} --replication-factor 1 --partitions 1

# Console producer: each line typed on stdin is sent to the netflix_data topic.
# Replace the {...} placeholder with <EC2 public IP>:9092.
cd kafka_2.13-3.7.0
bin/kafka-console-producer.sh \
    --bootstrap-server {Put the Public IP of your EC2 Instance:9092} \
    --topic netflix_data
# Console consumer: prints the messages read from the netflix_data topic.
# Replace the {...} placeholder with <EC2 public IP>:9092.
cd kafka_2.13-3.7.0
bin/kafka-console-consumer.sh \
    --bootstrap-server {Put the Public IP of your EC2 Instance:9092} \
    --topic netflix_data










-- Every column of the rows in gakas_kafka_netflix_data whose release_year is 2020.
SELECT
    *
FROM "netflix_movies_db"."gakas_kafka_netflix_data"
WHERE release_year = 2020;


-- Number of rows for each distinct value of "type" in gakas_kafka_netflix_data.
-- The aggregate is aliased so the result column has a readable name instead of
-- an engine-generated one.
SELECT
    type,
    COUNT(*) AS title_count
FROM "netflix_movies_db"."gakas_kafka_netflix_data"
GROUP BY type;
