xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 93a78ecaa45114d68952d08c4778f073515102f2)
1 /*
2  * Copyright 2006-2007, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 #include "EndpointManager.h"
14 
15 #include <net_buffer.h>
16 #include <net_datalink.h>
17 #include <net_stat.h>
18 #include <NetBufferUtilities.h>
19 #include <NetUtilities.h>
20 
21 #include <lock.h>
22 #include <util/AutoLock.h>
23 #include <util/khash.h>
24 #include <util/list.h>
25 
26 #include <KernelExport.h>
27 
28 #include <netinet/in.h>
29 #include <netinet/ip.h>
30 #include <netinet/tcp.h>
31 #include <new>
32 #include <stdlib.h>
33 #include <string.h>
34 
35 
36 // References:
37 //  - RFC 793 - Transmission Control Protocol
38 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
39 //
40 // Things this implementation currently doesn't implement:
41 //
42 // TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, RFC 2001, RFC 2581, RFC 3042
43 // NewReno Modification to TCP's Fast Recovery, RFC 2582
44 // Explicit Congestion Notification (ECN), RFC 3168
45 // SYN-Cache
46 // TCP Extensions for High Performance, RFC 1323
47 // SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
48 // Forward RTO-Recovery, RFC 4138
49 
50 #define PrintAddress(address) \
51 	AddressString(Domain(), address, true).Data()
52 
53 //#define TRACE_TCP
54 //#define PROBE_TCP
55 
56 #ifdef TRACE_TCP
57 // the space before ', ##args' is important in order for this to work with cpp 2.95
58 #	define TRACE(format, args...)	dprintf("TCP [%llu] %p (%12s) " format "\n", \
59 		system_time(), this, name_for_state(fState) , ##args)
60 #else
61 #	define TRACE(args...)			do { } while (0)
62 #endif
63 
64 #ifdef PROBE_TCP
65 #	define PROBE(buffer, window) \
66 	dprintf("TCP PROBE %llu %s %s %ld %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu\n", \
67 		system_time(), PrintAddress(buffer->source), \
68 		PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \
69 		(uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \
70 		window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \
71 		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
72 #else
73 #	define PROBE(buffer, window)	do { } while (0)
74 #endif
75 
76 // Initial estimate for packet round trip time (RTT)
77 #define TCP_INITIAL_RTT		2000000
78 
79 // constants for the fFlags field
// Bit values stored in TCPEndpoint::fFlags. Each constant occupies its own
// bit, so they can be combined and tested independently.
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
};
88 
89 
90 static const int kTimestampFactor = 1024;
91 
92 
93 static inline bigtime_t
94 absolute_timeout(bigtime_t timeout)
95 {
96 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
97 		return timeout;
98 
99 	return timeout + system_time();
100 }
101 
102 
103 static inline status_t
104 posix_error(status_t error)
105 {
106 	if (error == B_TIMED_OUT)
107 		return B_WOULD_BLOCK;
108 
109 	return error;
110 }
111 
112 
113 static inline bool
114 in_window(const tcp_sequence &sequence, const tcp_sequence &rcvNext,
115 	uint32 rcvWindow)
116 {
117 	return sequence >= rcvNext && sequence < (rcvNext + rcvWindow);
118 }
119 
120 
121 static inline bool
122 segment_in_sequence(const tcp_segment_header &segment, int size,
123 	const tcp_sequence &rcvNext, uint32 rcvWindow)
124 {
125 	tcp_sequence sequence(segment.sequence);
126 	if (size == 0) {
127 		if (rcvWindow == 0)
128 			return sequence == rcvNext;
129 		return in_window(sequence, rcvNext, rcvWindow);
130 	} else {
131 		if (rcvWindow == 0)
132 			return false;
133 		return in_window(sequence, rcvNext, rcvWindow)
134 			|| in_window(sequence + size - 1, rcvNext, rcvWindow);
135 	}
136 }
137 
138 
139 static inline bool
140 is_writable(tcp_state state)
141 {
142 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
143 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
144 }
145 
146 
147 static inline uint32 tcp_now()
148 {
149 	return system_time() / kTimestampFactor;
150 }
151 
152 
153 static inline uint32 tcp_diff_timestamp(uint32 base)
154 {
155 	uint32 now = tcp_now();
156 
157 	if (now > base)
158 		return now - base;
159 
160 	return now + UINT_MAX - base;
161 }
162 
163 
164 static inline bool
165 state_needs_finish(int32 state)
166 {
167 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
168 		|| state == FINISH_SENT || state == CLOSING;
169 }
170 
171 
172 WaitList::WaitList(const char *name)
173 {
174 	fCondition = 0;
175 	fSem = create_sem(0, name);
176 }
177 
178 
179 WaitList::~WaitList()
180 {
181 	delete_sem(fSem);
182 }
183 
184 
185 status_t
186 WaitList::InitCheck() const
187 {
188 	return fSem;
189 }
190 
191 
192 status_t
193 WaitList::Wait(MutexLocker &locker, bigtime_t timeout, bool wakeNext)
194 {
195 	locker.Unlock();
196 
197 	status_t status = B_OK;
198 
199 	while (status == B_OK && !atomic_test_and_set(&fCondition, 0, 1))
200 		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
201 			timeout);
202 
203 	locker.Lock();
204 	if (status == B_OK && wakeNext)
205 		Signal();
206 
207 	return status;
208 }
209 
210 
211 void
212 WaitList::Signal()
213 {
214 	atomic_or(&fCondition, 1);
215 	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_IF_WAITING_ONLY);
216 }
217 
218 
219 TCPEndpoint::TCPEndpoint(net_socket *socket)
220 	:
221 	ProtocolSocket(socket),
222 	fManager(NULL),
223 	fReceiveList("tcp receive"),
224 	fSendList("tcp send"),
225 	fOptions(0),
226 	fSendWindowShift(0),
227 	fReceiveWindowShift(0),
228 	fSendUnacknowledged(0),
229 	fSendNext(0),
230 	fSendMax(0),
231 	fSendWindow(0),
232 	fSendMaxWindow(0),
233 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
234 	fSendQueue(socket->send.buffer_size),
235 	fInitialSendSequence(0),
236 	fDuplicateAcknowledgeCount(0),
237 	fRoute(NULL),
238 	fReceiveNext(0),
239 	fReceiveMaxAdvertised(0),
240 	fReceiveWindow(socket->receive.buffer_size),
241 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
242 	fReceiveQueue(socket->receive.buffer_size),
243 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
244 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
245 	fRetransmitTimeout(TCP_INITIAL_RTT),
246 	fReceivedTimestamp(0),
247 	fCongestionWindow(0),
248 	fSlowStartThreshold(0),
249 	fState(CLOSED),
250 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP),
251 	fError(B_OK)
252 {
253 	//gStackModule->init_timer(&fTimer, _TimeWait, this);
254 
255 	// TODO: to be replaced with a real locking strategy!
256 	mutex_init(&fLock, "tcp lock");
257 
258 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
259 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, this);
260 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
261 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
262 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this);
263 }
264 
265 
266 TCPEndpoint::~TCPEndpoint()
267 {
268 	mutex_lock(&fLock);
269 
270 	gStackModule->cancel_timer(&fRetransmitTimer);
271 	gStackModule->cancel_timer(&fPersistTimer);
272 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
273 	gStackModule->cancel_timer(&fTimeWaitTimer);
274 
275 	if (fManager) {
276 		fManager->Unbind(this);
277 		return_endpoint_manager(fManager);
278 	}
279 
280 	mutex_destroy(&fLock);
281 }
282 
283 
284 status_t
285 TCPEndpoint::InitCheck() const
286 {
287 	if (fLock.sem < B_OK)
288 		return fLock.sem;
289 
290 	if (fReceiveList.InitCheck() < B_OK)
291 		return fReceiveList.InitCheck();
292 
293 	if (fSendList.InitCheck() < B_OK)
294 		return fSendList.InitCheck();
295 
296 	return B_OK;
297 }
298 
299 
300 //	#pragma mark - protocol API
301 
302 
303 status_t
304 TCPEndpoint::Open()
305 {
306 	TRACE("Open()");
307 
308 	status_t status = ProtocolSocket::Open();
309 	if (status < B_OK)
310 		return status;
311 
312 	fManager = create_endpoint_manager(Domain());
313 	if (fManager == NULL)
314 		return EAFNOSUPPORT;
315 
316 	return B_OK;
317 }
318 
319 
320 status_t
321 TCPEndpoint::Close()
322 {
323 	TRACE("Close()");
324 
325 	MutexLocker lock(fLock);
326 
327 	if (fState == LISTEN)
328 		delete_sem(fAcceptSemaphore);
329 
330 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
331 		fState = CLOSED;
332 		return B_OK;
333 	}
334 
335 	status_t status = _ShutdownEgress(true);
336 	if (status != B_OK)
337 		return status;
338 
339 	if (socket->options & SO_LINGER) {
340 		TRACE("Close(): Lingering for %i secs", socket->linger);
341 
342 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
343 
344 		while (fSendQueue.Used() > 0) {
345 			status = fSendList.Wait(lock, maximum);
346 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
347 				break;
348 			else if (status < B_OK)
349 				return status;
350 		}
351 
352 		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
353 			fSendQueue.Used());
354 	}
355 
356 	// TODO: do i need to wait until fState returns to CLOSED?
357 	return B_OK;
358 }
359 
360 
361 status_t
362 TCPEndpoint::Free()
363 {
364 	TRACE("Free()");
365 
366 	MutexLocker _(fLock);
367 
368 	if (fState <= SYNCHRONIZE_SENT || fState == TIME_WAIT)
369 		return B_OK;
370 
371 	// we are only interested in the timer, not in changing state
372 	_EnterTimeWait();
373 	return B_BUSY;
374 		// we'll be freed later when the 2MSL timer expires
375 }
376 
377 
378 /*!
379 	Creates and sends a synchronize packet to /a address, and then waits
380 	until the connection has been established or refused.
381 */
382 status_t
383 TCPEndpoint::Connect(const sockaddr *address)
384 {
385 	TRACE("Connect() on address %s", PrintAddress(address));
386 
387 	MutexLocker locker(fLock);
388 
389 	// Can only call connect() from CLOSED or LISTEN states
390 	// otherwise endpoint is considered already connected
391 	if (fState == LISTEN) {
392 		// this socket is about to connect; remove pending connections in the backlog
393 		gSocketModule->set_max_backlog(socket, 0);
394 	} else if (fState != CLOSED)
395 		return EISCONN;
396 
397 	status_t status = _PrepareSendPath(address);
398 	if (status < B_OK)
399 		return status;
400 
401 	TRACE("  Connect(): starting 3-way handshake...");
402 
403 	fState = SYNCHRONIZE_SENT;
404 
405 	// send SYN
406 	status = _SendQueued();
407 	if (status != B_OK) {
408 		fState = CLOSED;
409 		return status;
410 	}
411 
412 	// If we are running over Loopback, after _SendQueued() returns we
413 	// may be in ESTABLISHED already.
414 	if (fState == ESTABLISHED) {
415 		TRACE("  Connect() completed after _SendQueued()");
416 		return B_OK;
417 	}
418 
419 	// wait until 3-way handshake is complete (if needed)
420 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
421 	if (timeout == 0) {
422 		// we're a non-blocking socket
423 		TRACE("  Connect() delayed, return EINPROGRESS");
424 		return EINPROGRESS;
425 	}
426 
427 	status = _WaitForEstablished(locker, absolute_timeout(timeout));
428 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
429 		strerror(status), timeout);
430 	return posix_error(status);
431 }
432 
433 
434 status_t
435 TCPEndpoint::Accept(struct net_socket **_acceptedSocket)
436 {
437 	TRACE("Accept()");
438 
439 	MutexLocker locker(fLock);
440 
441 	status_t status;
442 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
443 
444 	do {
445 		locker.Unlock();
446 
447 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
448 			| B_CAN_INTERRUPT, timeout);
449 		if (status < B_OK)
450 			return status;
451 
452 		locker.Lock();
453 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
454 		if (status == B_OK)
455 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
456 	} while (status < B_OK);
457 
458 	return status;
459 }
460 
461 
462 status_t
463 TCPEndpoint::Bind(const sockaddr *address)
464 {
465 	if (address == NULL)
466 		return B_BAD_VALUE;
467 
468 	MutexLocker lock(fLock);
469 
470 	TRACE("Bind() on address %s", PrintAddress(address));
471 
472 	if (fState != CLOSED)
473 		return EISCONN;
474 
475 	return fManager->Bind(this, address);
476 }
477 
478 
479 status_t
480 TCPEndpoint::Unbind(struct sockaddr *address)
481 {
482 	TRACE("Unbind()");
483 
484 	MutexLocker lock(fLock);
485 	return fManager->Unbind(this);
486 }
487 
488 
489 status_t
490 TCPEndpoint::Listen(int count)
491 {
492 	TRACE("Listen()");
493 
494 	MutexLocker lock(fLock);
495 
496 	if (fState != CLOSED)
497 		return B_BAD_VALUE;
498 
499 	fAcceptSemaphore = create_sem(0, "tcp accept");
500 	if (fAcceptSemaphore < B_OK)
501 		return ENOBUFS;
502 
503 	status_t status = fManager->SetPassive(this);
504 	if (status < B_OK) {
505 		delete_sem(fAcceptSemaphore);
506 		fAcceptSemaphore = -1;
507 		return status;
508 	}
509 
510 	fState = LISTEN;
511 	return B_OK;
512 }
513 
514 
515 status_t
516 TCPEndpoint::Shutdown(int direction)
517 {
518 	TRACE("Shutdown(%i)", direction);
519 
520 	MutexLocker lock(fLock);
521 
522 	if (direction == SHUT_RD || direction == SHUT_RDWR)
523 		fFlags |= FLAG_NO_RECEIVE;
524 
525 	if (direction == SHUT_WR || direction == SHUT_RDWR)
526 		_ShutdownEgress(false);
527 
528 	return B_OK;
529 }
530 
531 
532 /*!
533 	Puts data contained in \a buffer into send buffer
534 */
535 status_t
536 TCPEndpoint::SendData(net_buffer *buffer)
537 {
538 	MutexLocker lock(fLock);
539 
540 	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
541 		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
542 		  fSendQueue.Free());
543 
544 	if (fState == CLOSED)
545 		return ENOTCONN;
546 	else if (fState == LISTEN) {
547 		return EDESTADDRREQ;
548 	} else if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED
549 				|| fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
550 				|| fState == TIME_WAIT) {
551 		// TODO: send SIGPIPE signal to app?
552 		return EPIPE;
553 	}
554 
555 	if (buffer->size > 0) {
556 		if (buffer->size > fSendQueue.Size())
557 			return EMSGSIZE;
558 
559 		bigtime_t timeout = absolute_timeout(socket->send.timeout);
560 
561 		while (fSendQueue.Free() < buffer->size) {
562 			status_t status = fSendList.Wait(lock, timeout);
563 			if (status < B_OK) {
564 				TRACE("  SendData() returning %s (%d)",
565 					strerror(posix_error(status)), (int)posix_error(status));
566 				return posix_error(status);
567 			}
568 		}
569 
570 		fSendQueue.Add(buffer);
571 	}
572 
573 	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());
574 
575 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
576 		_SendQueued();
577 
578 	return B_OK;
579 }
580 
581 
582 ssize_t
583 TCPEndpoint::SendAvailable()
584 {
585 	MutexLocker locker(fLock);
586 
587 	ssize_t available;
588 
589 	if (is_writable(fState))
590 		available = fSendQueue.Free();
591 	else
592 		available = EPIPE;
593 
594 	TRACE("SendAvailable(): %li", available);
595 	return available;
596 }
597 
598 
599 status_t
600 TCPEndpoint::FillStat(net_stat *stat)
601 {
602 	MutexLocker _(fLock);
603 
604 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
605 	stat->receive_queue_size = fReceiveQueue.Available();
606 	stat->send_queue_size = fSendQueue.Used();
607 
608 	return B_OK;
609 }
610 
611 
612 status_t
613 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
614 {
615 	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);
616 
617 	MutexLocker locker(fLock);
618 
619 	*_buffer = NULL;
620 
621 	if (fState == CLOSED)
622 		return ENOTCONN;
623 
624 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
625 
626 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
627 		if (flags & MSG_DONTWAIT)
628 			return B_WOULD_BLOCK;
629 
630 		status_t status = _WaitForEstablished(locker, timeout);
631 		if (status < B_OK)
632 			return posix_error(status);
633 	}
634 
635 	size_t dataNeeded = socket->receive.low_water_mark;
636 
637 	// When MSG_WAITALL is set then the function should block
638 	// until the full amount of data can be returned.
639 	if (flags & MSG_WAITALL)
640 		dataNeeded = numBytes;
641 
642 	// TODO: add support for urgent data (MSG_OOB)
643 
644 	while (true) {
645 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
646 				|| fState == TIME_WAIT) {
647 			// ``Connection closing''.
648 			return B_OK;
649 		}
650 
651 		if (fReceiveQueue.Available() > 0) {
652 			if (fReceiveQueue.Available() >= dataNeeded ||
653 				((fReceiveQueue.PushedData() > 0)
654 					&& (fReceiveQueue.PushedData() >= fReceiveQueue.Available())))
655 				break;
656 		} else if (fState == FINISH_RECEIVED) {
657 			// ``If no text is awaiting delivery, the RECEIVE will
658 			//   get a Connection closing''.
659 			return B_OK;
660 		}
661 
662 		if ((flags & MSG_DONTWAIT) || (socket->receive.timeout == 0))
663 			return B_WOULD_BLOCK;
664 
665 		status_t status = fReceiveList.Wait(locker, timeout, false);
666 		if (status < B_OK) {
667 			// The Open Group base specification mentions that EINTR should be
668 			// returned if the recv() is interrupted before _any data_ is
669 			// available. So we actually check if there is data, and if so,
670 			// push it to the user.
671 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
672 					&& fReceiveQueue.Available() > 0)
673 				break;
674 
675 			return posix_error(status);
676 		}
677 	}
678 
679 	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());
680 
681 	if (numBytes < fReceiveQueue.Available())
682 		fReceiveList.Signal();
683 
684 	bool clone = (flags & MSG_PEEK);
685 
686 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
687 
688 	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());
689 
690 	// if we are opening the window, check if we should send an ACK
691 	if (!clone)
692 		SendAcknowledge(false);
693 
694 	return receivedBytes;
695 }
696 
697 
698 ssize_t
699 TCPEndpoint::ReadAvailable()
700 {
701 	MutexLocker locker(fLock);
702 
703 	TRACE("ReadAvailable(): %li", _AvailableData());
704 
705 	return _AvailableData();
706 }
707 
708 
709 status_t
710 TCPEndpoint::SetSendBufferSize(size_t length)
711 {
712 	MutexLocker _(fLock);
713 	fSendQueue.SetMaxBytes(length);
714 	return B_OK;
715 }
716 
717 
718 status_t
719 TCPEndpoint::SetReceiveBufferSize(size_t length)
720 {
721 	MutexLocker _(fLock);
722 	fReceiveQueue.SetMaxBytes(length);
723 	return B_OK;
724 }
725 
726 
727 status_t
728 TCPEndpoint::SetOption(int option, const void *_value, int length)
729 {
730 	if (option != TCP_NODELAY)
731 		return B_BAD_VALUE;
732 
733 	if (length != sizeof(int))
734 		return B_BAD_VALUE;
735 
736 	const int *value = (const int *)_value;
737 
738 	MutexLocker _(fLock);
739 	if (*value)
740 		fOptions |= TCP_NODELAY;
741 	else
742 		fOptions &= ~TCP_NODELAY;
743 
744 	return B_OK;
745 }
746 
747 
748 //	#pragma mark - misc
749 
750 
751 bool
752 TCPEndpoint::IsBound() const
753 {
754 	return !LocalAddress().IsEmpty(true);
755 }
756 
757 
758 void
759 TCPEndpoint::DeleteSocket()
760 {
761 	// the next call will delete `this'.
762 	gSocketModule->delete_socket(socket);
763 }
764 
765 
766 status_t
767 TCPEndpoint::DelayedAcknowledge()
768 {
769 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
770 		// timer was active, send an ACK now (with the exception above,
771 		// we send every other ACK)
772 		return SendAcknowledge(true);
773 	}
774 
775 	gStackModule->set_timer(&fDelayedAcknowledgeTimer, TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
776 	return B_OK;
777 }
778 
779 
780 status_t
781 TCPEndpoint::SendAcknowledge(bool force)
782 {
783 	return _SendQueued(force, 0);
784 }
785 
786 
787 void
788 TCPEndpoint::_StartPersistTimer()
789 {
790 	gStackModule->set_timer(&fPersistTimer, 1000000LL);
791 }
792 
793 
794 void
795 TCPEndpoint::_EnterTimeWait()
796 {
797 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
798 }
799 
800 
801 status_t
802 TCPEndpoint::UpdateTimeWait()
803 {
804 	return B_OK;
805 }
806 
807 
808 //	#pragma mark - receive
809 
810 
811 int32
812 TCPEndpoint::_ListenReceive(tcp_segment_header &segment, net_buffer *buffer)
813 {
814 	TRACE("ListenReceive()");
815 
816 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
817 	// but the error behaviour differs
818 	if (segment.flags & TCP_FLAG_RESET)
819 		return DROP;
820 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
821 		return DROP | RESET;
822 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
823 		return DROP;
824 
825 	// TODO: drop broadcast/multicast
826 
827 	// spawn new endpoint for accept()
828 	net_socket *newSocket;
829 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK)
830 		return DROP;
831 
832 	return ((TCPEndpoint *)newSocket->first_protocol)->Spawn(this,
833 		segment, buffer);
834 }
835 
836 
837 int32
838 TCPEndpoint::Spawn(TCPEndpoint *parent, tcp_segment_header &segment,
839 	net_buffer *buffer)
840 {
841 	MutexLocker _(fLock);
842 
843 	// TODO error checking
844 	ProtocolSocket::Open();
845 
846 	fState = SYNCHRONIZE_RECEIVED;
847 	fManager = parent->fManager;
848 
849 	LocalAddress().SetTo(buffer->destination);
850 	PeerAddress().SetTo(buffer->source);
851 
852 	TRACE("Spawn()");
853 
854 	// TODO: proper error handling!
855 	if (fManager->BindChild(this) < B_OK)
856 		return DROP;
857 
858 	if (_PrepareSendPath(*PeerAddress()) < B_OK)
859 		return DROP;
860 
861 	fOptions = parent->fOptions;
862 	fAcceptSemaphore = parent->fAcceptSemaphore;
863 
864 	_PrepareReceivePath(segment);
865 
866 	// send SYN+ACK
867 	if (_SendQueued() < B_OK)
868 		return DROP;
869 
870 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
871 		// we handled this flag now, it must not be set for further processing
872 
873 	return _Receive(segment, buffer);
874 }
875 
876 
877 void
878 TCPEndpoint::DumpInternalState() const
879 {
880 	kprintf("Lock: { sem: %ld, holder: %ld }\n", fLock.sem, fLock.holder);
881 	kprintf("AcceptSem: %ld\n", fAcceptSemaphore);
882 	kprintf("Options: 0x%lx\n", (uint32)fOptions);
883 	kprintf("SendWindowShift: %lu\n", (uint32)fSendWindowShift);
884 	kprintf("ReceiveWindowShift: %lu\n", (uint32)fReceiveWindowShift);
885 	kprintf("SendUnacknowledged: %lu\n", (uint32)fSendUnacknowledged);
886 	kprintf("SendNext: %lu\n", (uint32)fSendNext);
887 	kprintf("SendMax: %lu\n", (uint32)fSendMax);
888 	kprintf("SendWindow: %lu\n", fSendWindow);
889 	kprintf("SendMaxWindow: %lu\n", fSendMaxWindow);
890 	kprintf("SendMaxSegmentSize: %lu\n", fSendMaxSegmentSize);
891 	kprintf("Send-Q: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
892 	kprintf("LastAcknowledgeSent: %lu\n", (uint32)fLastAcknowledgeSent);
893 	kprintf("InitialSendSequence: %lu\n", (uint32)fInitialSendSequence);
894 	kprintf("DuplicateAcknowledgeCount: %lu\n", fDuplicateAcknowledgeCount);
895 	kprintf("ReceiveNext: %lu\n", (uint32)fReceiveNext);
896 	kprintf("ReceiveMaxAdvertised: %lu\n", (uint32)fReceiveMaxAdvertised);
897 	kprintf("ReceiveWindow: %lu\n", (uint32)fReceiveWindow);
898 	kprintf("ReceiveMaxSegmentSize: %lu\n", (uint32)fReceiveMaxSegmentSize);
899 	kprintf("Recv-Q: %lu / %lu\n", fReceiveQueue.Available(),
900 		fReceiveQueue.Size());
901 	kprintf("InitialReceiveSequence: %lu\n", (uint32)fInitialReceiveSequence);
902 	kprintf("RoundTripTime: %ld (dev %ld)\n", fRoundTripTime,
903 		fRoundTripDeviation);
904 	kprintf("RetransmitTimeout: %llu\n", (uint64)fRetransmitTimeout);
905 	kprintf("CongestionWindow: %lu\n", fCongestionWindow);
906 	kprintf("SlowStartThreshold: %lu\n", fSlowStartThreshold);
907 	kprintf("State: %s\n", name_for_state(fState));
908 	kprintf("Flags: 0x%lx\n", fFlags);
909 }
910 
911 
912 int32
913 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, net_buffer *buffer)
914 {
915 	TRACE("SynchronizeSentReceive()");
916 
917 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
918 		&& (fInitialSendSequence >= segment.acknowledge
919 			|| fSendMax < segment.acknowledge))
920 		return DROP | RESET;
921 
922 	if (segment.flags & TCP_FLAG_RESET) {
923 		fError = ECONNREFUSED;
924 		fState = CLOSED;
925 		return DROP;
926 	}
927 
928 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
929 		return DROP;
930 
931 	fSendUnacknowledged = segment.acknowledge;
932 	_PrepareReceivePath(segment);
933 
934 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
935 		_MarkEstablished();
936 	} else {
937 		// simultaneous open
938 		fState = SYNCHRONIZE_RECEIVED;
939 	}
940 
941 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
942 		// we handled this flag now, it must not be set for further processing
943 
944 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
945 }
946 
947 
948 int32
949 TCPEndpoint::SegmentReceived(tcp_segment_header &segment, net_buffer *buffer)
950 {
951 	MutexLocker locker(fLock);
952 
953 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s",
954 		buffer, buffer->size, PrintAddress(buffer->source),
955 		PrintAddress(buffer->destination));
956 	TRACE("                   flags 0x%x, seq %lu, ack %lu, wnd %lu",
957 		segment.flags, segment.sequence, segment.acknowledge,
958 		(uint32)segment.advertised_window << fSendWindowShift);
959 
960 	int32 segmentAction = DROP;
961 
962 	switch (fState) {
963 		case LISTEN:
964 			segmentAction = _ListenReceive(segment, buffer);
965 			break;
966 
967 		case SYNCHRONIZE_SENT:
968 			segmentAction = _SynchronizeSentReceive(segment, buffer);
969 			break;
970 
971 		case SYNCHRONIZE_RECEIVED:
972 		case ESTABLISHED:
973 		case FINISH_RECEIVED:
974 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
975 		case FINISH_SENT:
976 		case FINISH_ACKNOWLEDGED:
977 		case CLOSING:
978 		case TIME_WAIT:
979 		case CLOSED:
980 			segmentAction = _SegmentReceived(segment, buffer);
981 			break;
982 	}
983 
984 	// process acknowledge action as asked for by the *Receive() method
985 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
986 		SendAcknowledge(true);
987 	else if (segmentAction & ACKNOWLEDGE)
988 		DelayedAcknowledge();
989 
990 	return segmentAction;
991 }
992 
993 int32
994 TCPEndpoint::_SegmentReceived(tcp_segment_header &segment, net_buffer *buffer)
995 {
996 	uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift;
997 
998 	// First, handle the most common case for uni-directional data transfer
999 	// (known as header prediction - the segment must not change the window,
1000 	// and must be the expected sequence, and contain no control flags)
1001 
1002 	if (fState == ESTABLISHED
1003 		&& segment.AcknowledgeOnly()
1004 		&& fReceiveNext == segment.sequence
1005 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1006 		&& fSendNext == fSendMax) {
1007 
1008 		_UpdateTimestamps(segment, buffer->size);
1009 
1010 		if (buffer->size == 0) {
1011 			// this is a pure acknowledge segment - we're on the sending end
1012 			if (fSendUnacknowledged < segment.acknowledge
1013 				&& fSendMax >= segment.acknowledge) {
1014 				_Acknowledged(segment);
1015 				return DROP;
1016 			}
1017 		} else if (segment.acknowledge == fSendUnacknowledged
1018 			&& fReceiveQueue.IsContiguous()
1019 			&& fReceiveQueue.Free() >= buffer->size
1020 			&& !(fFlags & FLAG_NO_RECEIVE)) {
1021 			if (_AddData(segment, buffer))
1022 				_NotifyReader();
1023 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) ?
1024 				IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1025 		}
1026 	}
1027 
1028 	// The fast path was not applicable, so we continue with the standard
1029 	// processing of the incoming segment
1030 
1031 	if (fState != SYNCHRONIZE_SENT && fState != LISTEN && fState != CLOSED) {
1032 		// 1. check sequence number
1033 		if (!segment_in_sequence(segment, buffer->size, fReceiveNext,
1034 				fReceiveWindow)) {
1035 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1036 				(uint32)fReceiveNext, fReceiveWindow);
1037 			if (segment.flags & TCP_FLAG_RESET)
1038 				return DROP;
1039 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1040 		}
1041 	}
1042 
1043 	return _Receive(segment, buffer);
1044 }
1045 
1046 
1047 //	#pragma mark - send
1048 
1049 
1050 inline uint8
1051 TCPEndpoint::_CurrentFlags()
1052 {
1053 	// we don't set FLAG_FINISH here, instead we do it
1054 	// conditionally below depending if we are sending
1055 	// the last bytes of the send queue.
1056 
1057 	switch (fState) {
1058 		case CLOSED:
1059 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1060 
1061 		case SYNCHRONIZE_SENT:
1062 			return TCP_FLAG_SYNCHRONIZE;
1063 		case SYNCHRONIZE_RECEIVED:
1064 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1065 
1066 		case ESTABLISHED:
1067 		case FINISH_RECEIVED:
1068 		case FINISH_ACKNOWLEDGED:
1069 		case TIME_WAIT:
1070 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1071 		case FINISH_SENT:
1072 		case CLOSING:
1073 			return TCP_FLAG_ACKNOWLEDGE;
1074 
1075 		default:
1076 			return B_ERROR;
1077 	}
1078 }
1079 
1080 
1081 inline bool
1082 TCPEndpoint::_ShouldSendSegment(tcp_segment_header &segment, uint32 length,
1083 	uint32 segmentMaxSize, uint32 flightSize)
1084 {
1085 	if (length > 0) {
1086 		// Avoid the silly window syndrome - we only send a segment in case:
1087 		// - we have a full segment to send, or
1088 		// - we're at the end of our buffer queue, or
1089 		// - the buffer is at least larger than half of the maximum send window, or
1090 		// - we're retransmitting data
1091 		if (length == segmentMaxSize
1092 			|| (fOptions & TCP_NODELAY) != 0
1093 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1094 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1095 			return true;
1096 	}
1097 
1098 	// check if we need to send a window update to the peer
1099 	if (segment.advertised_window > 0) {
1100 		// correct the window to take into account what already has been advertised
1101 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1102 			- (fReceiveMaxAdvertised - fReceiveNext);
1103 
1104 		// if we can advertise a window larger than twice the maximum segment
1105 		// size, or half the maximum buffer size we send a window update
1106 		if (window >= (fReceiveMaxSegmentSize << 1)
1107 			|| window >= (socket->receive.buffer_size >> 1))
1108 			return true;
1109 	}
1110 
1111 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH | TCP_FLAG_RESET)) != 0)
1112 		return true;
1113 
1114 	// there is no reason to send a segment just now
1115 	return false;
1116 }
1117 
1118 
1119 status_t
1120 TCPEndpoint::_SendQueued(bool force)
1121 {
1122 	return _SendQueued(force, fSendWindow);
1123 }
1124 
1125 
1126 /*!
1127 	Sends one or more TCP segments with the data waiting in the queue, or some
1128 	specific flags that need to be sent.
1129 */
1130 status_t
1131 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1132 {
1133 	if (fRoute == NULL)
1134 		return B_ERROR;
1135 
1136 	// in passive state?
1137 	if (fState == LISTEN)
1138 		return B_ERROR;
1139 
1140 	tcp_segment_header segment(_CurrentFlags());
1141 
1142 	if ((fOptions & TCP_NOOPT) == 0) {
1143 		if (fFlags & FLAG_OPTION_TIMESTAMP) {
1144 			segment.options |= TCP_HAS_TIMESTAMPS;
1145 			segment.timestamp_reply = fReceivedTimestamp;
1146 			segment.timestamp_value = tcp_now();
1147 		}
1148 
1149 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE)
1150 			&& (fSendNext == fInitialSendSequence)) {
1151 			// add connection establishment options
1152 			segment.max_segment_size = fReceiveMaxSegmentSize;
1153 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1154 				segment.options |= TCP_HAS_WINDOW_SCALE;
1155 				segment.window_shift = fReceiveWindowShift;
1156 			}
1157 		}
1158 	}
1159 
1160 	size_t availableBytes = fReceiveQueue.Free();
1161 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1162 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1163 	else
1164 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1165 
1166 	segment.acknowledge = fReceiveNext;
1167 	segment.urgent_offset = 0;
1168 
1169 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1170 		sendWindow = fCongestionWindow;
1171 
1172 	// SND.UNA  SND.NXT        SND.MAX
1173 	//  |        |              |
1174 	//  v        v              v
1175 	//  -----------------------------------
1176 	//  | effective window           |
1177 	//  -----------------------------------
1178 
1179 	// Flight size represents the window of data which is currently in the
1180 	// ether. We should never send data such as the flight size becomes larger
1181 	// than the effective window. Note however that the effective window may be
1182 	// reduced (by congestion for instance), so at some point in time flight
1183 	// size may be larger than the currently calculated window.
1184 
1185 	uint32 flightSize = fSendMax - fSendUnacknowledged;
1186 	uint32 consumedWindow = fSendNext - fSendUnacknowledged;
1187 
1188 	if (consumedWindow > sendWindow) {
1189 		sendWindow = 0;
1190 		// TODO enter persist state? try to get a window update.
1191 	} else
1192 		sendWindow -= consumedWindow;
1193 
1194 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1195 		// send one byte of data to ask for a window update
1196 		// (triggered by the persist timer)
1197 		sendWindow = 1;
1198 	}
1199 
1200 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1201 	tcp_sequence previousSendNext = fSendNext;
1202 
1203 	do {
1204 		uint32 segmentMaxSize = fSendMaxSegmentSize
1205 			- tcp_options_length(segment);
1206 		uint32 segmentLength = min_c(length, segmentMaxSize);
1207 
1208 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
1209 			if (state_needs_finish(fState))
1210 				segment.flags |= TCP_FLAG_FINISH;
1211 			if (length > 0)
1212 				segment.flags |= TCP_FLAG_PUSH;
1213 		}
1214 
1215 		// Determine if we should really send this segment
1216 		if (!force && !_ShouldSendSegment(segment, segmentLength,
1217 			segmentMaxSize, flightSize)) {
1218 			if (fSendQueue.Available()
1219 				&& !gStackModule->is_timer_active(&fPersistTimer)
1220 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
1221 				_StartPersistTimer();
1222 			break;
1223 		}
1224 
1225 		net_buffer *buffer = gBufferModule->create(256);
1226 		if (buffer == NULL)
1227 			return B_NO_MEMORY;
1228 
1229 		status_t status = B_OK;
1230 		if (segmentLength > 0)
1231 			fSendQueue.Get(buffer, fSendNext, segmentLength);
1232 		if (status < B_OK) {
1233 			gBufferModule->free(buffer);
1234 			return status;
1235 		}
1236 
1237 		LocalAddress().CopyTo(buffer->source);
1238 		PeerAddress().CopyTo(buffer->destination);
1239 
1240 		uint32 size = buffer->size;
1241 		segment.sequence = fSendNext;
1242 
1243 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s",
1244 			buffer, buffer->size, PrintAddress(buffer->source),
1245 			PrintAddress(buffer->destination));
1246 		TRACE("              flags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu"
1247 			", ssthresh %lu", segment.flags, segment.sequence,
1248 			segment.acknowledge, segment.advertised_window,
1249 			fCongestionWindow, fSlowStartThreshold);
1250 		TRACE("              len %lu first %lu last %lu", segmentLength,
1251 			(uint32)fSendQueue.FirstSequence(),
1252 			(uint32)fSendQueue.LastSequence());
1253 
1254 		PROBE(buffer, sendWindow);
1255 		sendWindow -= buffer->size;
1256 
1257 		status = add_tcp_header(AddressModule(), segment, buffer);
1258 		if (status != B_OK) {
1259 			gBufferModule->free(buffer);
1260 			return status;
1261 		}
1262 
1263 		// Update send status - we need to do this before we send the data
1264 		// for local connections as the answer is directly handled
1265 
1266 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
1267 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
1268 			segment.max_segment_size = 0;
1269 			size++;
1270 		}
1271 
1272 		if (segment.flags & TCP_FLAG_FINISH)
1273 			size++;
1274 
1275 		uint32 sendMax = fSendMax;
1276 		fSendNext += size;
1277 		if (fSendMax < fSendNext)
1278 			fSendMax = fSendNext;
1279 
1280 		fReceiveMaxAdvertised = fReceiveNext
1281 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
1282 
1283 		status = next->module->send_routed_data(next, fRoute, buffer);
1284 		if (status < B_OK) {
1285 			gBufferModule->free(buffer);
1286 
1287 			fSendNext = segment.sequence;
1288 			fSendMax = sendMax;
1289 				// restore send status
1290 			return status;
1291 		}
1292 
1293 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1294 			fLastAcknowledgeSent = segment.acknowledge;
1295 
1296 		length -= segmentLength;
1297 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET | TCP_FLAG_FINISH);
1298 	} while (length > 0);
1299 
1300 	// if we sent data from the beggining of the send queue,
1301 	// start the retransmition timer
1302 	if (previousSendNext == fSendUnacknowledged
1303 		&& fSendNext > previousSendNext) {
1304 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
1305 			fRetransmitTimeout);
1306 
1307 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
1308 	}
1309 
1310 	return B_OK;
1311 }
1312 
1313 
1314 int
1315 TCPEndpoint::_GetMSS(const sockaddr *address) const
1316 {
1317 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
1318 }
1319 
1320 
1321 status_t
1322 TCPEndpoint::_ShutdownEgress(bool closing)
1323 {
1324 	tcp_state previousState = fState;
1325 
1326 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1327 		fState = FINISH_SENT;
1328 	else if (fState == FINISH_RECEIVED)
1329 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1330 	else
1331 		return B_OK;
1332 
1333 	status_t status = _SendQueued();
1334 	if (status != B_OK) {
1335 		fState = previousState;
1336 		return status;
1337 	}
1338 
1339 	return B_OK;
1340 }
1341 
1342 
1343 ssize_t
1344 TCPEndpoint::_AvailableData() const
1345 {
1346 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1347 	//       the application of FLAG_NO_RECEIVE in listen()ing
1348 	//       sockets.
1349 	if (fState == LISTEN)
1350 		return gSocketModule->count_connected(socket);
1351 	else if (fState == SYNCHRONIZE_SENT)
1352 		return 0;
1353 
1354 	ssize_t availableData = fReceiveQueue.Available();
1355 
1356 	if (availableData == 0 && !_ShouldReceive())
1357 		return ENOTCONN;
1358 
1359 	return availableData;
1360 }
1361 
1362 
1363 void
1364 TCPEndpoint::_NotifyReader()
1365 {
1366 	fReceiveList.Signal();
1367 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1368 }
1369 
1370 
1371 bool
1372 TCPEndpoint::_ShouldReceive() const
1373 {
1374 	if (fFlags & FLAG_NO_RECEIVE)
1375 		return false;
1376 
1377 	return fState == ESTABLISHED || fState == FINISH_SENT
1378 			|| fState == FINISH_ACKNOWLEDGED;
1379 }
1380 
1381 
1382 int32
1383 TCPEndpoint::_Receive(tcp_segment_header &segment, net_buffer *buffer)
1384 {
1385 	uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift;
1386 
1387 	size_t segmentLength = buffer->size;
1388 
1389 	if (segment.flags & TCP_FLAG_RESET) {
1390 		// is this a valid reset?
1391 		if (fLastAcknowledgeSent <= segment.sequence
1392 			&& tcp_sequence(segment.sequence)
1393 				< (fLastAcknowledgeSent + fReceiveWindow)) {
1394 			if (fState == SYNCHRONIZE_RECEIVED)
1395 				fError = ECONNREFUSED;
1396 			else if (fState == CLOSING || fState == TIME_WAIT
1397 					|| fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1398 				fError = ENOTCONN;
1399 			else
1400 				fError = ECONNRESET;
1401 
1402 			_NotifyReader();
1403 			fState = CLOSED;
1404 		}
1405 
1406 		return DROP;
1407 	}
1408 
1409 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1410 		|| (fState == SYNCHRONIZE_RECEIVED
1411 			&& (fInitialReceiveSequence > segment.sequence
1412 				|| (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1413 					&& (fSendUnacknowledged > segment.acknowledge
1414 						|| fSendMax < segment.acknowledge)))) {
1415 		// reset the connection - either the initial SYN was faulty, or we
1416 		// received a SYN within the data stream
1417 		return DROP | RESET;
1418 	}
1419 
1420 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1421 		// the window must not shrink
1422 
1423 	// trim buffer to be within the receive window
1424 	int32 drop = fReceiveNext - segment.sequence;
1425 	if (drop > 0) {
1426 		if ((uint32)drop > buffer->size
1427 			|| ((uint32)drop == buffer->size
1428 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1429 			// don't accidently remove a FIN we shouldn't remove
1430 			segment.flags &= ~TCP_FLAG_FINISH;
1431 			drop = buffer->size;
1432 		}
1433 
1434 		// remove duplicate data at the start
1435 		TRACE("* remove %ld bytes from the start", drop);
1436 		gBufferModule->remove_header(buffer, drop);
1437 		segment.sequence += drop;
1438 	}
1439 
1440 	int32 action = KEEP;
1441 
1442 	drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow);
1443 	if (drop > 0) {
1444 		// remove data exceeding our window
1445 		if ((uint32)drop >= buffer->size) {
1446 			// if we can accept data, or the segment is not what we'd expect,
1447 			// drop the segment (an immediate acknowledge is always triggered)
1448 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1449 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1450 
1451 			action |= IMMEDIATE_ACKNOWLEDGE;
1452 		}
1453 
1454 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1455 			// we need to remove the finish, too, as part of the data
1456 			drop--;
1457 		}
1458 
1459 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1460 		TRACE("* remove %ld bytes from the end", drop);
1461 		gBufferModule->remove_trailer(buffer, drop);
1462 	}
1463 
1464 	if (advertisedWindow > fSendWindow)
1465 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1466 			advertisedWindow);
1467 
1468 	fSendWindow = advertisedWindow;
1469 	if (advertisedWindow > fSendMaxWindow)
1470 		fSendMaxWindow = advertisedWindow;
1471 
1472 	// Then look at the acknowledgement for any updates
1473 
1474 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1475 		// process acknowledged data
1476 		if (fState == SYNCHRONIZE_RECEIVED)
1477 			_MarkEstablished();
1478 
1479 		if (fSendMax < segment.acknowledge || fState == TIME_WAIT)
1480 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1481 
1482 		if (segment.acknowledge < fSendUnacknowledged) {
1483 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1484 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1485 				TRACE("Receive(): duplicate ack!");
1486 
1487 				_DuplicateAcknowledge(segment);
1488 			}
1489 
1490 			return DROP;
1491 		} else {
1492 			// this segment acknowledges in flight data
1493 
1494 			if (fDuplicateAcknowledgeCount >= 3) {
1495 				// deflate the window.
1496 				fCongestionWindow = fSlowStartThreshold;
1497 			}
1498 
1499 			fDuplicateAcknowledgeCount = 0;
1500 
1501 			if (fSendMax == segment.acknowledge)
1502 				TRACE("Receive(): all inflight data ack'd!");
1503 
1504 			if (segment.acknowledge > fSendQueue.LastSequence()
1505 					&& fState > ESTABLISHED) {
1506 				TRACE("Receive(): FIN has been acknowledged!");
1507 
1508 				switch (fState) {
1509 					case FINISH_SENT:
1510 						fState = FINISH_ACKNOWLEDGED;
1511 						break;
1512 					case CLOSING:
1513 						fState = TIME_WAIT;
1514 						_EnterTimeWait();
1515 						return DROP;
1516 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1517 						fState = CLOSED;
1518 						break;
1519 
1520 					default:
1521 						break;
1522 				}
1523 			}
1524 
1525 			if (fState != CLOSED)
1526 				_Acknowledged(segment);
1527 		}
1528 	}
1529 
1530 	if (segment.flags & TCP_FLAG_URGENT) {
1531 		if (fState == ESTABLISHED || fState == FINISH_SENT
1532 			|| fState == FINISH_ACKNOWLEDGED) {
1533 			// TODO: Handle urgent data:
1534 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1535 			//  - signal the user that urgent data is available (SIGURG)
1536 		}
1537 	}
1538 
1539 	bool notify = false;
1540 
1541 	if (buffer->size > 0 &&	_ShouldReceive())
1542 		notify = _AddData(segment, buffer);
1543 	else
1544 		action = (action & ~KEEP) | DROP;
1545 
1546 	if (segment.flags & TCP_FLAG_FINISH) {
1547 		segmentLength++;
1548 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1549 			TRACE("Receive(): peer is finishing connection!");
1550 			fReceiveNext++;
1551 			notify = true;
1552 
1553 			// FIN implies PSH
1554 			fReceiveQueue.SetPushPointer();
1555 
1556 			// we'll reply immediatly to the FIN if we are not
1557 			// transitioning to TIME WAIT so we immediatly ACK it.
1558 			action |= IMMEDIATE_ACKNOWLEDGE;
1559 
1560 			// other side is closing connection; change states
1561 			switch (fState) {
1562 				case ESTABLISHED:
1563 				case SYNCHRONIZE_RECEIVED:
1564 					fState = FINISH_RECEIVED;
1565 					break;
1566 				case FINISH_SENT:
1567 					// simultaneous close
1568 					fState = CLOSING;
1569 					break;
1570 				case FINISH_ACKNOWLEDGED:
1571 					fState = TIME_WAIT;
1572 					_EnterTimeWait();
1573 					break;
1574 				default:
1575 					break;
1576 			}
1577 		}
1578 	}
1579 
1580 	if (notify)
1581 		_NotifyReader();
1582 
1583 	if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1584 		action |= ACKNOWLEDGE;
1585 
1586 	_UpdateTimestamps(segment, segmentLength);
1587 
1588 	TRACE("Receive() Action %ld", action);
1589 
1590 	return action;
1591 }
1592 
1593 
1594 void
1595 TCPEndpoint::_UpdateTimestamps(tcp_segment_header &segment,
1596 	size_t segmentLength)
1597 {
1598 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1599 		tcp_sequence sequence(segment.sequence);
1600 
1601 		if ((fLastAcknowledgeSent >= sequence
1602 				&& fLastAcknowledgeSent < (sequence + segmentLength)))
1603 			fReceivedTimestamp = segment.timestamp_value;
1604 	}
1605 }
1606 
1607 
1608 void
1609 TCPEndpoint::_MarkEstablished()
1610 {
1611 	fState = ESTABLISHED;
1612 
1613 	if (socket->parent != NULL) {
1614 		gSocketModule->set_connected(socket);
1615 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1616 	}
1617 
1618 	fSendList.Signal();
1619 }
1620 
1621 
1622 status_t
1623 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1624 {
1625 	while (fState != ESTABLISHED) {
1626 		status_t status = fSendList.Wait(locker, timeout);
1627 		if (status < B_OK)
1628 			return status;
1629 	}
1630 
1631 	return B_OK;
1632 }
1633 
1634 
1635 bool
1636 TCPEndpoint::_AddData(tcp_segment_header &segment, net_buffer *buffer)
1637 {
1638 	fReceiveQueue.Add(buffer, segment.sequence);
1639 	fReceiveNext = fReceiveQueue.NextSequence();
1640 
1641 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1642 		(uint32)fReceiveNext, fReceiveQueue.Available());
1643 
1644 	if (segment.flags & TCP_FLAG_PUSH)
1645 		fReceiveQueue.SetPushPointer();
1646 
1647 	return fReceiveQueue.Available() > 0;
1648 }
1649 
1650 
1651 void
1652 TCPEndpoint::_PrepareReceivePath(tcp_segment_header &segment)
1653 {
1654 	fInitialReceiveSequence = segment.sequence;
1655 
1656 	// count the received SYN
1657 	segment.sequence++;
1658 
1659 	fReceiveNext = segment.sequence;
1660 	fReceiveQueue.SetInitialSequence(segment.sequence);
1661 
1662 	if ((fOptions & TCP_NOOPT) == 0) {
1663 		if (segment.max_segment_size > 0)
1664 			fSendMaxSegmentSize = segment.max_segment_size;
1665 
1666 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1667 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1668 			fSendWindowShift = segment.window_shift;
1669 		} else {
1670 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1671 			fReceiveWindowShift = 0;
1672 		}
1673 
1674 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1675 			fFlags |= FLAG_OPTION_TIMESTAMP;
1676 			fReceivedTimestamp = segment.timestamp_value;
1677 		} else
1678 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1679 	}
1680 
1681 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1682 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1683 }
1684 
1685 
1686 status_t
1687 TCPEndpoint::_PrepareSendPath(const sockaddr *peer)
1688 {
1689 	if (fRoute == NULL) {
1690 		fRoute = gDatalinkModule->get_route(Domain(), peer);
1691 		if (fRoute == NULL)
1692 			return ENETUNREACH;
1693 	}
1694 
1695 	// make sure connection does not already exist
1696 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
1697 		fRoute->interface->address);
1698 	if (status < B_OK)
1699 		return status;
1700 
1701 	fInitialSendSequence = system_time() >> 4;
1702 	fSendNext = fInitialSendSequence;
1703 	fSendUnacknowledged = fInitialSendSequence;
1704 	fSendMax = fInitialSendSequence;
1705 
1706 	// we are counting the SYN here
1707 	fSendQueue.SetInitialSequence(fSendNext + 1);
1708 
1709 	fReceiveMaxSegmentSize = _GetMSS(peer);
1710 
1711 	// Compute the window shift we advertise to our peer - if it doesn't support
1712 	// this option, this will be reset to 0 (when its SYN is received)
1713 	fReceiveWindowShift = 0;
1714 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
1715 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
1716 		fReceiveWindowShift++;
1717 	}
1718 
1719 	return B_OK;
1720 }
1721 
1722 
1723 void
1724 TCPEndpoint::_Acknowledged(tcp_segment_header &segment)
1725 {
1726 	size_t previouslyUsed = fSendQueue.Used();
1727 
1728 	fSendQueue.RemoveUntil(segment.acknowledge);
1729 	fSendUnacknowledged = segment.acknowledge;
1730 
1731 	if (fSendNext < fSendUnacknowledged)
1732 		fSendNext = fSendUnacknowledged;
1733 
1734 	if (fSendUnacknowledged == fSendMax)
1735 		gStackModule->cancel_timer(&fRetransmitTimer);
1736 
1737 	if (fSendQueue.Used() < previouslyUsed) {
1738 		// this ACK acknowledged data
1739 
1740 		if (segment.options & TCP_HAS_TIMESTAMPS)
1741 			_UpdateSRTT(tcp_diff_timestamp(segment.timestamp_reply));
1742 		else {
1743 			// TODO Fallback to RFC 793 type estimation
1744 		}
1745 
1746 		if (is_writable(fState)) {
1747 			// notify threads waiting on the socket to become writable again
1748 			fSendList.Signal();
1749 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
1750 		}
1751 
1752 		if (fCongestionWindow < fSlowStartThreshold)
1753 			fCongestionWindow += fSendMaxSegmentSize;
1754 	}
1755 
1756 	if (fCongestionWindow >= fSlowStartThreshold) {
1757 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
1758 
1759 		if (increment < fCongestionWindow)
1760 			increment = 1;
1761 		else
1762 			increment /= fCongestionWindow;
1763 
1764 		fCongestionWindow += increment;
1765 	}
1766 
1767 	// if there is data left to be send, send it now
1768 	if (fSendQueue.Used() > 0)
1769 		_SendQueued();
1770 }
1771 
1772 
1773 void
1774 TCPEndpoint::_Retransmit()
1775 {
1776 	TRACE("Retransmit()");
1777 	_ResetSlowStart();
1778 	fSendNext = fSendUnacknowledged;
1779 	_SendQueued();
1780 }
1781 
1782 
1783 void
1784 TCPEndpoint::_UpdateSRTT(int32 roundTripTime)
1785 {
1786 	int32 rtt = roundTripTime;
1787 
1788 	// Update_SRTT() as per Van Jacobson
1789 	rtt -= (fRoundTripTime / 8);
1790 	fRoundTripTime += rtt;
1791 	if (rtt < 0)
1792 		rtt = -rtt;
1793 	rtt -= (fRoundTripDeviation / 4);
1794 	fRoundTripDeviation += rtt;
1795 
1796 	fRetransmitTimeout = ((fRoundTripTime / 4 +
1797 		fRoundTripDeviation) / 2) * kTimestampFactor;
1798 
1799 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
1800 		roundTripTime);
1801 }
1802 
1803 
1804 void
1805 TCPEndpoint::_ResetSlowStart()
1806 {
1807 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2,
1808 		2 * fSendMaxSegmentSize);
1809 	fCongestionWindow = fSendMaxSegmentSize;
1810 }
1811 
1812 
1813 void
1814 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1815 {
1816 	fDuplicateAcknowledgeCount++;
1817 
1818 	if (fDuplicateAcknowledgeCount < 3)
1819 		return;
1820 	else if (fDuplicateAcknowledgeCount == 3) {
1821 		_ResetSlowStart();
1822 		fCongestionWindow = fSlowStartThreshold + 3
1823 			* fSendMaxSegmentSize;
1824 		fSendNext = segment.acknowledge;
1825 	} else if (fDuplicateAcknowledgeCount > 3)
1826 		fCongestionWindow += fSendMaxSegmentSize;
1827 
1828 	_SendQueued();
1829 }
1830 
1831 
1832 //	#pragma mark - timer
1833 
1834 
1835 /*static*/ void
1836 TCPEndpoint::_RetransmitTimer(net_timer *timer, void *data)
1837 {
1838 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1839 
1840 	MutexLocker locker(endpoint->fLock);
1841 	if (!locker.IsLocked())
1842 		return;
1843 
1844 	endpoint->_Retransmit();
1845 }
1846 
1847 
1848 /*static*/ void
1849 TCPEndpoint::_PersistTimer(net_timer *timer, void *data)
1850 {
1851 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1852 
1853 	MutexLocker locker(endpoint->fLock);
1854 	if (!locker.IsLocked())
1855 		return;
1856 
1857 	endpoint->_SendQueued(true);
1858 }
1859 
1860 
1861 /*static*/ void
1862 TCPEndpoint::_DelayedAcknowledgeTimer(struct net_timer *timer, void *data)
1863 {
1864 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1865 
1866 	MutexLocker locker(endpoint->fLock);
1867 	if (!locker.IsLocked())
1868 		return;
1869 
1870 	endpoint->SendAcknowledge(true);
1871 }
1872 
1873 
1874 /*static*/ void
1875 TCPEndpoint::_TimeWaitTimer(struct net_timer *timer, void *data)
1876 {
1877 	TCPEndpoint *endpoint = (TCPEndpoint *)data;
1878 
1879 	if (mutex_lock(&endpoint->fLock) < B_OK)
1880 		return;
1881 
1882 	endpoint->DeleteSocket();
1883 }
1884 
1885