o
    Mdh.                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlZd dlmZ d dlZd dlZeeZG d	d
 d
eZdS )    N)BaseCommand)default_storage)ChatbotFileRAGMetadata)	PdfReader)Document)SentenceTransformer)settingsc                   @   s\   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdddZ	dddZ
dd Zdd ZdS )CommandzTProcesses PDFs, DOCX, and TXT files for a chatbot and creates RAG data with chunkingc                 C   s*   |j dtdd |j ddg ddd d S )	N
chatbot_idzID of the chatbot to process)typehelpz--chunk_strategysentence)r   fixed	paragraphzVChunking strategy: sentence (with paragraph awareness), fixed-size, or paragraph-based)defaultchoicesr   )add_argumentint)selfparser r   O/var/www/ai-chatbot-integration/chatbot/management/commands/process_rag_data.pyadd_arguments   s
   
zCommand.add_argumentsc           	   
   C   s   zGt |d7}t|}d}t|jdD ]\}}| pd}||d 7 }| jd| dt| d q|W  d   W S 1 s@w   Y  W dS  tyW } ztd	d}~ww )
z/Extract text from PDF with paragraph detection.rb    

zExtracted text from page z ( characters)NzFailed to extract text from PDF)	openr   	enumeratepagesextract_textstdoutwritelen	Exception)	r   	file_pathfpdftextpage_numpage	page_texter   r   r   extract_text_from_pdf   s    (zCommand.extract_text_from_pdfc              
   C   s~   z)t |}d}|jD ]}|j r||j d 7 }q
| jdt| d |W S  ty> } z	tdt| d}~ww )z0Extract text from DOCX with paragraph detection.r   r   zExtracted text from DOCX (r   z"Failed to extract text from DOCX: N)	r   
paragraphsr+   stripr$   r%   r&   r'   str)r   r(   docr+   parar/   r   r   r   extract_text_from_docx(   s   

zCommand.extract_text_from_docxc              
   C   s   z:t |ddd(}| }ddd |dD }| jdt| d |W  d	   W S 1 s3w   Y  W d	S  tyO } z	td
t| d	}~ww )zExtract text from TXT file.rzutf-8)encodingr   c                 s   s     | ]}|  r|  V  qd S )Nr2   ).0r5   r   r   r   	<genexpr>:   s    z0Command.extract_text_from_txt.<locals>.<genexpr>zExtracted text from TXT (r   Nz!Failed to extract text from TXT: )	r    readjoinsplitr$   r%   r&   r'   r3   )r   r(   r)   r+   r/   r   r   r   extract_text_from_txt5   s   (zCommand.extract_text_from_txtc                 C   sX   t j|d  }|dkr| |S |dkr| |S |dkr%| |S td| )z0Extract text from a file based on its extension.r   z.pdfz.docxz.txtzUnsupported file type: )ospathsplitextlowerr0   r6   r?   
ValueError)r   r(   file_extensionr   r   r   extract_text_from_file@   s   


zCommand.extract_text_from_file  P   c                 C   s  t d}dd |dD }g }d}| jdt| d |D ]}||}	g }
d}dd |	jD }|D ]k}t||}|| |kr|
rz|d	|
|d
| d |d7 }|rfd	|
| d d nd}|rn||gn|g}
t||| }q5|
| |d	|
|d
| d |d7 }g }
d}q5|
| ||7 }q5|
r|d	|
|d
| d |d7 }q!| jdt| d |S )z@Chunk text into sentences while respecting paragraph boundaries.en_core_web_smc                 S      g | ]
}|  r|  qS r   r9   r:   pr   r   r   
<listcomp>O       z-Command.sentence_chunking.<locals>.<listcomp>r   r   	Detected  paragraphsc                 S   s    g | ]}|j  r|j  qS r   )r+   r2   )r:   sentr   r   r   rM   Y         chunk_r+   indexidr   
   Nr   Created z sentence-based chunks)	spacyloadr>   r$   r%   r&   sentsappendr=   )r   r+   
max_tokensoverlap_tokensnlpr1   chunkschunk_indexr5   r4   current_chunkcurrent_tokens	sentencesrQ   sent_tokensoverlap_textr   r   r   sentence_chunkingL   sX   
 


zCommand.sentence_chunking  c                    s2    fddt dt D }dd t|D S )zFallback fixed-size chunking.c                    s   g | ]
}||   qS r   r   )r:   i
chunk_sizer+   r   r   rM      rN   z*Command.fixed_chunking.<locals>.<listcomp>r   c                 S   s"   g | ]\}}||d | dqS )rT   rU   r   )r:   rj   chunkr   r   r   rM      s    
)ranger&   r!   )r   r+   rl   ra   r   rk   r   fixed_chunking   s    zCommand.fixed_chunkingc                 C   sz   dd | dD }g }d}| jdt| d |D ]}|||d| d |d	7 }q| jd
t| d |S )zChunk text into paragraphs.c                 S   rJ   r   r9   rK   r   r   r   rM      rN   z.Command.paragraph_chunking.<locals>.<listcomp>r   r   rO   rP   rT   rU   r   rY   z paragraph-based chunks)r>   r$   r%   r&   r]   )r   r+   r1   ra   rb   r5   r   r   r   paragraph_chunking   s   
zCommand.paragraph_chunkingc                    s`  t   }|d  |d }ztjj d}| jjd|j d  ddd | j  W n  tjyJ   | jj| j	
d	  d
dd | j  Y d S w tjj|d}|sk| jj| j	d|j dd | j  d S tjttjd d}d  }z|j|d | jjd| dd | j  W n   | jjd| dd | j  Y z|j|d}	| jjd| dd | j  W n) ty }
 z| jj| j	
dt|
 dd | j  W Y d }
~
d S d }
~
ww | jjddd | j  td}| jjddd | j  |D ]| jjdjj dd | j  t   }tjjddid\}}ztjj}| jjd| dd | j  | |}| jjdt| ddd | j  | jjd| d dd | j  |d!kr| j|d"d#d$}n|d%kr| j|d&d'}n
|d(kr|  |}| jjd)t| d*dd | j  | jjd+dd | j  d,d- |D }|j!|d.d/}| jjd0t| d1dd | j  d2d- |D } fd3d-|D }|	j"|||fd4d-|D d5 | jjd6t| d7dd | j  t||_#d8|_$|%  | jjd9jj dd | j  t   | }| jjd:|d;d<dd | j  W q
 ty }
 z=d=|_$t|
|_&|%  | jj| j	
d>jj d?t|
 dd | j  t'(d>jj d?t|
  W Y d }
~
q
d }
~
ww t   | }| jj| j	)d@|j dA|d;d<dd | j  d S )BNr   chunk_strategyrW   z%Starting RAG processing for chatbot: z (ID: )
)endingzChatbot with ID z does not exist)chatbotzNo files found for chatbot 	chroma_db)rA   chatbot_)namezCleared existing collection: z!No existing collection to clear: zInitialized Chroma collection: z(Failed to initialize Chroma collection: z&Loading sentence-transformers model...zall-MiniLM-L6-v2z#Embedding model loaded successfullyz
Processing file: status
PROCESSING)filedefaultszReading file from: zTotal text extracted: z characterszChunking text using z strategy...r   rG   rH   )r^   r_   r   ri   )rl   r   rY   z chunkszGenerating embeddings...c                 S      g | ]}|d  qS )r+   r   r:   rm   r   r   r   rM          z"Command.handle.<locals>.<listcomp>F)show_progress_barz
Generated z embeddingsc                 S   r~   rr   r   r   r   r   r   rM      r   c                    s$   g | ]} j |d  |d dqS )rV   r+   )r   file_idrb   r+   rr   r   r   file_objr   r   rM      s    
c                    s    g | ]} j  d |d  qS )_rV   rr   r   )r   r   r   rM      rR   )	documents
embeddings	metadatasidszStored z chunks in Chroma collection	COMPLETEDzUpdated RAG metadata for file: zFile processing completed in z.2fz secondsFAILEDzFailed to process file z: z&
RAG processing completed for chatbot z in )*timer   objectsgetr$   r%   ry   flushDoesNotExiststyleERRORr   filterWARNINGchromadbPersistentClientr3   r
   
MEDIA_ROOTdelete_collectionget_or_create_collectionr'   r	   r|   r   get_or_creater   rA   rF   r&   rh   ro   rp   encodeaddchunk_countrz   saveerror_messageloggererrorSUCCESS)r   argsoptions
start_timerq   rv   fileschroma_clientcollection_name
collectionr/   embedderfile_start_timerag_metadatacreatedr(   r+   ra   chunk_textsr   r   r   file_durationtotal_durationr   r   r   handle   s     
 

"




















,
	*zCommand.handleN)rG   rH   )ri   )__name__
__module____qualname__r   r   r0   r6   r?   rF   rh   ro   rp   r   r   r   r   r   r      s    

5	r   )r   django.core.management.baser   django.core.files.storager   chatbot.modelsr   r   r   PyPDF2r   docxr   sentence_transformersr	   r   r@   django.confr
   rZ   logging	getLoggerr   r   r   r   r   r   r   <module>   s    
